diff options
Diffstat (limited to 'fs')
431 files changed, 15805 insertions, 7380 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 1964f98e74b..b85efa77394 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -594,21 +594,21 @@ static int __init init_v9fs(void)  	int err;  	pr_info("Installing v9fs 9p2000 file system support\n");  	/* TODO: Setup list of registered trasnport modules */ -	err = register_filesystem(&v9fs_fs_type); -	if (err < 0) { -		pr_err("Failed to register filesystem\n"); -		return err; -	}  	err = v9fs_cache_register();  	if (err < 0) {  		pr_err("Failed to register v9fs for caching\n"); -		goto out_fs_unreg; +		return err;  	}  	err = v9fs_sysfs_init();  	if (err < 0) {  		pr_err("Failed to register with sysfs\n"); +		goto out_cache; +	} +	err = register_filesystem(&v9fs_fs_type); +	if (err < 0) { +		pr_err("Failed to register filesystem\n");  		goto out_sysfs_cleanup;  	} @@ -617,8 +617,8 @@ static int __init init_v9fs(void)  out_sysfs_cleanup:  	v9fs_sysfs_cleanup(); -out_fs_unreg: -	unregister_filesystem(&v9fs_fs_type); +out_cache: +	v9fs_cache_unregister();  	return err;  } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 7b0cd87b07c..8c92a9ba833 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -155,9 +155,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,  		goto release_sb;  	} -	root = d_alloc_root(inode); +	root = d_make_root(inode);  	if (!root) { -		iput(inode);  		retval = -ENOMEM;  		goto release_sb;  	} @@ -260,7 +259,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)  	if (v9fs_proto_dotl(v9ses)) {  		res = p9_client_statfs(fid, &rs);  		if (res == 0) { -			buf->f_type = V9FS_MAGIC; +			buf->f_type = rs.type;  			buf->f_bsize = rs.bsize;  			buf->f_blocks = rs.blocks;  			buf->f_bfree = rs.bfree; diff --git a/fs/Kconfig b/fs/Kconfig index d621f02a3f9..f95ae3a027f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -4,6 +4,10 @@  menu "File systems" +# Use unaligned word dcache accesses +config DCACHE_WORD_ACCESS +       bool +  if BLOCK  source 
"fs/ext2/Kconfig" @@ -210,6 +214,7 @@ source "fs/minix/Kconfig"  source "fs/omfs/Kconfig"  source "fs/hpfs/Kconfig"  source "fs/qnx4/Kconfig" +source "fs/qnx6/Kconfig"  source "fs/romfs/Kconfig"  source "fs/pstore/Kconfig"  source "fs/sysv/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 93804d4d66e..2fb97793467 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_UBIFS_FS)		+= ubifs/  obj-$(CONFIG_AFFS_FS)		+= affs/  obj-$(CONFIG_ROMFS_FS)		+= romfs/  obj-$(CONFIG_QNX4FS_FS)		+= qnx4/ +obj-$(CONFIG_QNX6FS_FS)		+= qnx6/  obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/  obj-$(CONFIG_ADFS_FS)		+= adfs/  obj-$(CONFIG_FUSE_FS)		+= fuse/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 8e3b36ace30..06fdcc9382c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -483,10 +483,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_d_op = &adfs_dentry_operations;  	root = adfs_iget(sb, &root_obj); -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) {  		int i; -		iput(root);  		for (i = 0; i < asb->s_map_size; i++)  			brelse(asb->s_map[i].dm_bh);  		kfree(asb->s_map); diff --git a/fs/affs/super.c b/fs/affs/super.c index 8ba73fed796..0782653a05a 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -473,7 +473,7 @@ got_root:  	root_inode = affs_iget(sb, root_block);  	if (IS_ERR(root_inode)) {  		ret = PTR_ERR(root_inode); -		goto out_error_noinode; +		goto out_error;  	}  	if (AFFS_SB(sb)->s_flags & SF_INTL) @@ -481,7 +481,7 @@ got_root:  	else  		sb->s_d_op = &affs_dentry_operations; -	sb->s_root = d_alloc_root(root_inode); +	sb->s_root = d_make_root(root_inode);  	if (!sb->s_root) {  		printk(KERN_ERR "AFFS: Get root inode failed\n");  		goto out_error; @@ -494,9 +494,6 @@ got_root:  	 * Begin the cascaded cleanup ...  	 
*/  out_error: -	if (root_inode) -		iput(root_inode); -out_error_noinode:  	kfree(sbi->s_bitmap);  	affs_brelse(root_bh);  	kfree(sbi->s_prefix); diff --git a/fs/afs/file.c b/fs/afs/file.c index 14d89fa58fe..8f6e9234d56 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -251,7 +251,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,  	ASSERT(key != NULL);  	vnode = AFS_FS_I(mapping->host); -	if (vnode->flags & AFS_VNODE_DELETED) { +	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {  		_leave(" = -ESTALE");  		return -ESTALE;  	} diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 2f213d109c2..b960ff05ea0 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -365,10 +365,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,  		_debug("extract data");  		if (call->count > 0) {  			page = call->reply3; -			buffer = kmap_atomic(page, KM_USER0); +			buffer = kmap_atomic(page);  			ret = afs_extract_data(call, skb, last, buffer,  					       call->count); -			kunmap_atomic(buffer, KM_USER0); +			kunmap_atomic(buffer);  			switch (ret) {  			case 0:		break;  			case -EAGAIN:	return 0; @@ -411,9 +411,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,  	if (call->count < PAGE_SIZE) {  		_debug("clear");  		page = call->reply3; -		buffer = kmap_atomic(page, KM_USER0); +		buffer = kmap_atomic(page);  		memset(buffer + call->count, 0, PAGE_SIZE - call->count); -		kunmap_atomic(buffer, KM_USER0); +		kunmap_atomic(buffer);  	}  	_leave(" = 0 [done]"); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index d2b0888126d..a306bb6d88d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -109,7 +109,7 @@ struct afs_call {  	unsigned		reply_size;	/* current size of reply */  	unsigned		first_offset;	/* offset into mapping[first] */  	unsigned		last_to;	/* amount of mapping[last] */ -	unsigned short		offset;		/* offset into received data store */ +	unsigned		offset;		/* offset into received data store */  	unsigned 
char		unmarshall;	/* unmarshalling phase */  	bool			incoming;	/* T if incoming call */  	bool			send_pages;	/* T if data from mapping should be sent */ diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 8f4ce2658b7..298cf8919ec 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -200,9 +200,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)  		if (PageError(page))  			goto error; -		buf = kmap_atomic(page, KM_USER0); +		buf = kmap_atomic(page);  		memcpy(devname, buf, size); -		kunmap_atomic(buf, KM_USER0); +		kunmap_atomic(buf);  		page_cache_release(page);  		page = NULL;  	} diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e45a323aebb..8ad8c2a0703 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -314,6 +314,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,  	struct msghdr msg;  	struct kvec iov[1];  	int ret; +	struct sk_buff *skb;  	_enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -380,6 +381,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,  error_do_abort:  	rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); +	while ((skb = skb_dequeue(&call->rx_queue))) +		afs_free_skb(skb);  	rxrpc_kernel_end_call(rxcall);  	call->rxcall = NULL;  error_kill_call: diff --git a/fs/afs/super.c b/fs/afs/super.c index 983ec59fc80..f02b31e7e64 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -301,7 +301,6 @@ static int afs_fill_super(struct super_block *sb,  {  	struct afs_super_info *as = sb->s_fs_info;  	struct afs_fid fid; -	struct dentry *root = NULL;  	struct inode *inode = NULL;  	int ret; @@ -327,18 +326,16 @@ static int afs_fill_super(struct super_block *sb,  		set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);  	ret = -ENOMEM; -	root = d_alloc_root(inode); -	if (!root) +	sb->s_root = d_make_root(inode); +	if (!sb->s_root)  		goto error;  	sb->s_d_op = &afs_fs_dentry_operations; -	sb->s_root = root;  	_leave(" = 0");  	return 0;  error: -	iput(inode);  	_leave(" = %d", ret);  	
return ret;  } @@ -13,7 +13,7 @@  #include <linux/errno.h>  #include <linux/time.h>  #include <linux/aio_abi.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/syscalls.h>  #include <linux/backing-dev.h>  #include <linux/uio.h> @@ -160,7 +160,7 @@ static int aio_setup_ring(struct kioctx *ctx)  	info->nr = nr_events;		/* trusted copy */ -	ring = kmap_atomic(info->ring_pages[0], KM_USER0); +	ring = kmap_atomic(info->ring_pages[0]);  	ring->nr = nr_events;	/* user copy */  	ring->id = ctx->user_id;  	ring->head = ring->tail = 0; @@ -168,47 +168,38 @@ static int aio_setup_ring(struct kioctx *ctx)  	ring->compat_features = AIO_RING_COMPAT_FEATURES;  	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;  	ring->header_length = sizeof(struct aio_ring); -	kunmap_atomic(ring, KM_USER0); +	kunmap_atomic(ring);  	return 0;  }  /* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(, km).  Release the pointer with put_aio_ring_event(); + * kmap_atomic().  
Release the pointer with put_aio_ring_event();   */  #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))  #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))  #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({					\ +#define aio_ring_event(info, nr) ({					\  	unsigned pos = (nr) + AIO_EVENTS_OFFSET;			\  	struct io_event *__event;					\  	__event = kmap_atomic(						\ -			(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ +			(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \  	__event += pos % AIO_EVENTS_PER_PAGE;				\  	__event;							\  }) -#define put_aio_ring_event(event, km) do {	\ +#define put_aio_ring_event(event) do {		\  	struct io_event *__event = (event);	\  	(void)__event;				\ -	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ +	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \  } while(0)  static void ctx_rcu_free(struct rcu_head *head)  {  	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); -	unsigned nr_events = ctx->max_reqs; -  	kmem_cache_free(kioctx_cachep, ctx); - -	if (nr_events) { -		spin_lock(&aio_nr_lock); -		BUG_ON(aio_nr - nr_events > aio_nr); -		aio_nr -= nr_events; -		spin_unlock(&aio_nr_lock); -	}  }  /* __put_ioctx @@ -217,23 +208,23 @@ static void ctx_rcu_free(struct rcu_head *head)   */  static void __put_ioctx(struct kioctx *ctx)  { +	unsigned nr_events = ctx->max_reqs;  	BUG_ON(ctx->reqs_active); -	cancel_delayed_work(&ctx->wq); -	cancel_work_sync(&ctx->wq.work); +	cancel_delayed_work_sync(&ctx->wq);  	aio_free_ring(ctx);  	mmdrop(ctx->mm);  	ctx->mm = NULL; +	if (nr_events) { +		spin_lock(&aio_nr_lock); +		BUG_ON(aio_nr - nr_events > aio_nr); +		aio_nr -= nr_events; +		spin_unlock(&aio_nr_lock); +	}  	pr_debug("__put_ioctx: freeing %p\n", ctx);  	call_rcu(&ctx->rcu_head, ctx_rcu_free);  } -static inline void get_ioctx(struct kioctx *kioctx) -{ -	
BUG_ON(atomic_read(&kioctx->users) <= 0); -	atomic_inc(&kioctx->users); -} -  static inline int try_get_ioctx(struct kioctx *kioctx)  {  	return atomic_inc_not_zero(&kioctx->users); @@ -253,7 +244,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)  {  	struct mm_struct *mm;  	struct kioctx *ctx; -	int did_sync = 0; +	int err = -ENOMEM;  	/* Prevent overflows */  	if ((nr_events > (0x10000000U / sizeof(struct io_event))) || @@ -262,7 +253,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)  		return ERR_PTR(-EINVAL);  	} -	if ((unsigned long)nr_events > aio_max_nr) +	if (!nr_events || (unsigned long)nr_events > aio_max_nr)  		return ERR_PTR(-EAGAIN);  	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); @@ -273,7 +264,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)  	mm = ctx->mm = current->mm;  	atomic_inc(&mm->mm_count); -	atomic_set(&ctx->users, 1); +	atomic_set(&ctx->users, 2);  	spin_lock_init(&ctx->ctx_lock);  	spin_lock_init(&ctx->ring_info.ring_lock);  	init_waitqueue_head(&ctx->wait); @@ -286,25 +277,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)  		goto out_freectx;  	/* limit the number of system wide aios */ -	do { -		spin_lock_bh(&aio_nr_lock); -		if (aio_nr + nr_events > aio_max_nr || -		    aio_nr + nr_events < aio_nr) -			ctx->max_reqs = 0; -		else -			aio_nr += ctx->max_reqs; -		spin_unlock_bh(&aio_nr_lock); -		if (ctx->max_reqs || did_sync) -			break; - -		/* wait for rcu callbacks to have completed before giving up */ -		synchronize_rcu(); -		did_sync = 1; -		ctx->max_reqs = nr_events; -	} while (1); - -	if (ctx->max_reqs == 0) +	spin_lock(&aio_nr_lock); +	if (aio_nr + nr_events > aio_max_nr || +	    aio_nr + nr_events < aio_nr) { +		spin_unlock(&aio_nr_lock);  		goto out_cleanup; +	} +	aio_nr += ctx->max_reqs; +	spin_unlock(&aio_nr_lock);  	/* now link into global list. 
*/  	spin_lock(&mm->ioctx_lock); @@ -316,16 +296,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)  	return ctx;  out_cleanup: -	__put_ioctx(ctx); -	return ERR_PTR(-EAGAIN); - +	err = -EAGAIN; +	aio_free_ring(ctx);  out_freectx:  	mmdrop(mm);  	kmem_cache_free(kioctx_cachep, ctx); -	ctx = ERR_PTR(-ENOMEM); - -	dprintk("aio: error allocating ioctx %p\n", ctx); -	return ctx; +	dprintk("aio: error allocating ioctx %d\n", err); +	return ERR_PTR(err);  }  /* aio_cancel_all @@ -413,10 +390,6 @@ void exit_aio(struct mm_struct *mm)  		aio_cancel_all(ctx);  		wait_for_all_aios(ctx); -		/* -		 * Ensure we don't leave the ctx on the aio_wq -		 */ -		cancel_work_sync(&ctx->wq.work);  		if (1 != atomic_read(&ctx->users))  			printk(KERN_DEBUG @@ -490,6 +463,8 @@ static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)  		kmem_cache_free(kiocb_cachep, req);  		ctx->reqs_active--;  	} +	if (unlikely(!ctx->reqs_active && ctx->dead)) +		wake_up_all(&ctx->wait);  	spin_unlock_irq(&ctx->ctx_lock);  } @@ -607,11 +582,16 @@ static void aio_fput_routine(struct work_struct *data)  			fput(req->ki_filp);  		/* Link the iocb into the context's free list */ +		rcu_read_lock();  		spin_lock_irq(&ctx->ctx_lock);  		really_put_req(ctx, req); +		/* +		 * at that point ctx might've been killed, but actual +		 * freeing is RCU'd +		 */  		spin_unlock_irq(&ctx->ctx_lock); +		rcu_read_unlock(); -		put_ioctx(ctx);  		spin_lock_irq(&fput_lock);  	}  	spin_unlock_irq(&fput_lock); @@ -642,7 +622,6 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)  	 * this function will be executed w/out any aio kthread wakeup.  	 
*/  	if (unlikely(!fput_atomic(req->ki_filp))) { -		get_ioctx(ctx);  		spin_lock(&fput_lock);  		list_add(&req->ki_list, &fput_head);  		spin_unlock(&fput_lock); @@ -920,7 +899,7 @@ static void aio_kick_handler(struct work_struct *work)   	unuse_mm(mm);  	set_fs(oldfs);  	/* -	 * we're in a worker thread already, don't use queue_delayed_work, +	 * we're in a worker thread already; no point using non-zero delay  	 */  	if (requeue)  		queue_delayed_work(aio_wq, &ctx->wq, 0); @@ -1019,10 +998,10 @@ int aio_complete(struct kiocb *iocb, long res, long res2)  	if (kiocbIsCancelled(iocb))  		goto put_rq; -	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); +	ring = kmap_atomic(info->ring_pages[0]);  	tail = info->tail; -	event = aio_ring_event(info, tail, KM_IRQ0); +	event = aio_ring_event(info, tail);  	if (++tail >= info->nr)  		tail = 0; @@ -1043,8 +1022,8 @@ int aio_complete(struct kiocb *iocb, long res, long res2)  	info->tail = tail;  	ring->tail = tail; -	put_aio_ring_event(event, KM_IRQ0); -	kunmap_atomic(ring, KM_IRQ1); +	put_aio_ring_event(event); +	kunmap_atomic(ring);  	pr_debug("added to ring %p at [%lu]\n", iocb, tail); @@ -1089,7 +1068,7 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)  	unsigned long head;  	int ret = 0; -	ring = kmap_atomic(info->ring_pages[0], KM_USER0); +	ring = kmap_atomic(info->ring_pages[0]);  	dprintk("in aio_read_evt h%lu t%lu m%lu\n",  		 (unsigned long)ring->head, (unsigned long)ring->tail,  		 (unsigned long)ring->nr); @@ -1101,18 +1080,18 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)  	head = ring->head % info->nr;  	if (head != ring->tail) { -		struct io_event *evp = aio_ring_event(info, head, KM_USER1); +		struct io_event *evp = aio_ring_event(info, head);  		*ent = *evp;  		head = (head + 1) % info->nr;  		smp_mb(); /* finish reading the event before updatng the head */  		ring->head = head;  		ret = 1; -		put_aio_ring_event(evp, KM_USER1); +		put_aio_ring_event(evp);  	}  	
spin_unlock(&info->ring_lock);  out: -	kunmap_atomic(ring, KM_USER0); +	kunmap_atomic(ring);  	dprintk("leaving aio_read_evt: %d  h%lu t%lu\n", ret,  		 (unsigned long)ring->head, (unsigned long)ring->tail);  	return ret; @@ -1336,10 +1315,10 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)  	ret = PTR_ERR(ioctx);  	if (!IS_ERR(ioctx)) {  		ret = put_user(ioctx->user_id, ctxp); -		if (!ret) +		if (!ret) { +			put_ioctx(ioctx);  			return 0; - -		get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ +		}  		io_destroy(ioctx);  	} diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index f11e43ed907..28d39fb84ae 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -39,19 +39,6 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {  	.d_dname	= anon_inodefs_dname,  }; -static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, -				int flags, const char *dev_name, void *data) -{ -	return mount_pseudo(fs_type, "anon_inode:", NULL, -			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); -} - -static struct file_system_type anon_inode_fs_type = { -	.name		= "anon_inodefs", -	.mount		= anon_inodefs_mount, -	.kill_sb	= kill_anon_super, -}; -  /*   * nop .set_page_dirty method so that people can use .page_mkwrite on   * anon inodes. @@ -65,6 +52,62 @@ static const struct address_space_operations anon_aops = {  	.set_page_dirty = anon_set_page_dirty,  }; +/* + * A single inode exists for all anon_inode files. Contrary to pipes, + * anon_inode inodes have no associated per-instance data, so we need + * only allocate one of them. 
+ */ +static struct inode *anon_inode_mkinode(struct super_block *s) +{ +	struct inode *inode = new_inode_pseudo(s); + +	if (!inode) +		return ERR_PTR(-ENOMEM); + +	inode->i_ino = get_next_ino(); +	inode->i_fop = &anon_inode_fops; + +	inode->i_mapping->a_ops = &anon_aops; + +	/* +	 * Mark the inode dirty from the very beginning, +	 * that way it will never be moved to the dirty +	 * list because mark_inode_dirty() will think +	 * that it already _is_ on the dirty list. +	 */ +	inode->i_state = I_DIRTY; +	inode->i_mode = S_IRUSR | S_IWUSR; +	inode->i_uid = current_fsuid(); +	inode->i_gid = current_fsgid(); +	inode->i_flags |= S_PRIVATE; +	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +	return inode; +} + +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, +				int flags, const char *dev_name, void *data) +{ +	struct dentry *root; +	root = mount_pseudo(fs_type, "anon_inode:", NULL, +			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); +	if (!IS_ERR(root)) { +		struct super_block *s = root->d_sb; +		anon_inode_inode = anon_inode_mkinode(s); +		if (IS_ERR(anon_inode_inode)) { +			dput(root); +			deactivate_locked_super(s); +			root = ERR_CAST(anon_inode_inode); +		} +	} +	return root; +} + +static struct file_system_type anon_inode_fs_type = { +	.name		= "anon_inodefs", +	.mount		= anon_inodefs_mount, +	.kill_sb	= kill_anon_super, +}; +  /**   * anon_inode_getfile - creates a new file instance by hooking it up to an   *                      anonymous inode, and a dentry that describe the "class" @@ -180,38 +223,6 @@ err_put_unused_fd:  }  EXPORT_SYMBOL_GPL(anon_inode_getfd); -/* - * A single inode exists for all anon_inode files. Contrary to pipes, - * anon_inode inodes have no associated per-instance data, so we need - * only allocate one of them. 
- */ -static struct inode *anon_inode_mkinode(void) -{ -	struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); - -	if (!inode) -		return ERR_PTR(-ENOMEM); - -	inode->i_ino = get_next_ino(); -	inode->i_fop = &anon_inode_fops; - -	inode->i_mapping->a_ops = &anon_aops; - -	/* -	 * Mark the inode dirty from the very beginning, -	 * that way it will never be moved to the dirty -	 * list because mark_inode_dirty() will think -	 * that it already _is_ on the dirty list. -	 */ -	inode->i_state = I_DIRTY; -	inode->i_mode = S_IRUSR | S_IWUSR; -	inode->i_uid = current_fsuid(); -	inode->i_gid = current_fsgid(); -	inode->i_flags |= S_PRIVATE; -	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -	return inode; -} -  static int __init anon_inode_init(void)  {  	int error; @@ -224,16 +235,8 @@ static int __init anon_inode_init(void)  		error = PTR_ERR(anon_inode_mnt);  		goto err_unregister_filesystem;  	} -	anon_inode_inode = anon_inode_mkinode(); -	if (IS_ERR(anon_inode_inode)) { -		error = PTR_ERR(anon_inode_inode); -		goto err_mntput; -	} -  	return 0; -err_mntput: -	kern_unmount(anon_inode_mnt);  err_unregister_filesystem:  	unregister_filesystem(&anon_inode_fs_type);  err_exit: diff --git a/fs/attr.c b/fs/attr.c index 95053ad8abc..73f69a6ce9e 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -5,7 +5,7 @@   *  changes by Thomas Schoebel-Theuer   */ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/time.h>  #include <linux/mm.h>  #include <linux/string.h> diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d8d8e7ba6a1..eb1cc92cd67 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -110,6 +110,7 @@ struct autofs_sb_info {  	int sub_version;  	int min_proto;  	int max_proto; +	int compat_daemon;  	unsigned long exp_timeout;  	unsigned int type;  	int reghost_enabled; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 76741d8d778..85f1fcdb30e 100644 --- a/fs/autofs4/dev-ioctl.c +++ 
b/fs/autofs4/dev-ioctl.c @@ -385,6 +385,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,  		sbi->pipefd = pipefd;  		sbi->pipe = pipe;  		sbi->catatonic = 0; +		sbi->compat_daemon = is_compat_task();  	}  out:  	mutex_unlock(&sbi->wq_mutex); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 450f529a4ea..1feb68ecef9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -124,6 +124,7 @@ start:  	/* Negative dentry - try next */  	if (!simple_positive(q)) {  		spin_unlock(&p->d_lock); +		lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);  		p = q;  		goto again;  	} @@ -186,6 +187,7 @@ again:  	/* Negative dentry - try next */  	if (!simple_positive(ret)) {  		spin_unlock(&p->d_lock); +		lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);  		p = ret;  		goto again;  	} diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index c038727b405..cddc74b9cdb 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -31,11 +31,11 @@ static int __init init_autofs4_fs(void)  {  	int err; +	autofs_dev_ioctl_init(); +  	err = register_filesystem(&autofs_fs_type);  	if (err) -		return err; - -	autofs_dev_ioctl_init(); +		autofs_dev_ioctl_exit();  	return err;  } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index e16980b00b8..d8dc002e9cc 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -19,6 +19,7 @@  #include <linux/parser.h>  #include <linux/bitops.h>  #include <linux/magic.h> +#include <linux/compat.h>  #include "autofs_i.h"  #include <linux/module.h> @@ -224,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)  	set_autofs_type_indirect(&sbi->type);  	sbi->min_proto = 0;  	sbi->max_proto = 0; +	sbi->compat_daemon = is_compat_task();  	mutex_init(&sbi->wq_mutex);  	mutex_init(&sbi->pipe_mutex);  	spin_lock_init(&sbi->fs_lock); @@ -245,12 +247,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)  	if (!ino)  		goto fail_free;  	root_inode = autofs4_get_inode(s, S_IFDIR | 0755); -	
if (!root_inode) -		goto fail_ino; - -	root = d_alloc_root(root_inode); +	root = d_make_root(root_inode);  	if (!root) -		goto fail_iput; +		goto fail_ino;  	pipe = NULL;  	root->d_fsdata = ino; @@ -315,9 +314,6 @@ fail_fput:  fail_dput:  	dput(root);  	goto fail_free; -fail_iput: -	printk("autofs: get root dentry failed\n"); -	iput(root_inode);  fail_ino:  	kfree(ino);  fail_free: diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index da8876d38a7..9c098db4334 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -91,7 +91,24 @@ static int autofs4_write(struct autofs_sb_info *sbi,  	return (bytes > 0);  } -	 + +/* + * The autofs_v5 packet was misdesigned. + * + * The packets are identical on x86-32 and x86-64, but have different + * alignment. Which means that 'sizeof()' will give different results. + * Fix it up for the case of running 32-bit user mode on a 64-bit kernel. + */ +static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi) +{ +	size_t pktsz = sizeof(struct autofs_v5_packet); +#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) +	if (sbi->compat_daemon > 0) +		pktsz -= 4; +#endif +	return pktsz; +} +  static void autofs4_notify_daemon(struct autofs_sb_info *sbi,  				 struct autofs_wait_queue *wq,  				 int type) @@ -155,8 +172,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,  	{  		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; -		pktsz = sizeof(*packet); - +		pktsz = autofs_v5_packet_size(sbi);  		packet->wait_queue_token = wq->wait_queue_token;  		packet->len = wq->name.len;  		memcpy(packet->name, wq->name.name, wq->name.len); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 22e9a78872f..37268c5bb98 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -9,7 +9,7 @@   */  #include <linux/fs.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/stat.h>  #include <linux/time.h>  #include <linux/namei.h> diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 
6e6d536767f..e18da23d42b 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -852,9 +852,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)  		ret = PTR_ERR(root);  		goto unacquire_priv_sbp;  	} -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) { -		iput(root);  		befs_error(sb, "get root inode failed");  		goto unacquire_priv_sbp;  	} diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index b0391bc402b..e23dc7c8b88 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -367,9 +367,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)  		ret = PTR_ERR(inode);  		goto out2;  	} -	s->s_root = d_alloc_root(inode); +	s->s_root = d_make_root(inode);  	if (!s->s_root) { -		iput(inode);  		ret = -ENOMEM;  		goto out2;  	} diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a6395bdb26a..4d5e6d26578 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -259,8 +259,14 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)  	current->mm->free_area_cache = current->mm->mmap_base;  	current->mm->cached_hole_size = 0; +	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); +	if (retval < 0) { +		/* Someone check-me: is this error path enough? */ +		send_sig(SIGKILL, current, 0); +		return retval; +	} +  	install_exec_creds(bprm); - 	current->flags &= ~PF_FORKNOEXEC;  	if (N_MAGIC(ex) == OMAGIC) {  		unsigned long text_addr, map_size; @@ -352,13 +358,6 @@ beyond_if:  		return retval;  	} -	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); -	if (retval < 0) {  -		/* Someone check-me: is this error path enough? 
*/  -		send_sig(SIGKILL, current, 0);  -		return retval; -	} -  	current->mm->start_stack =  		(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);  #ifdef __alpha__ @@ -454,7 +453,8 @@ out:  static int __init init_aout_binfmt(void)  { -	return register_binfmt(&aout_format); +	register_binfmt(&aout_format); +	return 0;  }  static void __exit exit_aout_binfmt(void) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bcb884e2d61..504b6eee50a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -712,7 +712,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)  		goto out_free_dentry;  	/* OK, This is the point of no return */ -	current->flags &= ~PF_FORKNOEXEC;  	current->mm->def_flags = def_flags;  	/* Do this immediately, since STACK_TOP as used in setup_arg_pages @@ -934,7 +933,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)  #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */  	install_exec_creds(bprm); -	current->flags &= ~PF_FORKNOEXEC;  	retval = create_elf_tables(bprm, &loc->elf_ex,  			  load_addr, interp_load_addr);  	if (retval < 0) { @@ -1095,6 +1093,29 @@ out:   */  /* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ +	/* Any vsyscall mappings? */ +	if (vma == get_gate_vma(vma->vm_mm)) +		return true; +	/* +	 * arch_vma_name() returns non-NULL for special architecture mappings, +	 * such as vDSO sections. +	 */ +	if (arch_vma_name(vma)) +		return true; + +	return false; +} + +/*   * Decide what to dump of a segment, part, all or none.   
*/  static unsigned long vma_dump_size(struct vm_area_struct *vma, @@ -1102,10 +1123,13 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,  {  #define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type)) -	/* The vma can be set up to tell us the answer directly.  */ -	if (vma->vm_flags & VM_ALWAYSDUMP) +	/* always dump the vdso and vsyscall sections */ +	if (always_dump_vma(vma))  		goto whole; +	if (vma->vm_flags & VM_NODUMP) +		return 0; +  	/* Hugetlb memory check */  	if (vma->vm_flags & VM_HUGETLB) {  		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) @@ -1421,7 +1445,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,  	for (i = 1; i < view->n; ++i) {  		const struct user_regset *regset = &view->regsets[i];  		do_thread_regset_writeback(t->task, regset); -		if (regset->core_note_type && +		if (regset->core_note_type && regset->get &&  		    (!regset->active || regset->active(t->task, regset))) {  			int ret;  			size_t size = regset->n * regset->size; @@ -2077,7 +2101,8 @@ out:  static int __init init_elf_binfmt(void)  { -	return register_binfmt(&elf_format); +	register_binfmt(&elf_format); +	return 0;  }  static void __exit exit_elf_binfmt(void) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 30745f459fa..c64bf5ee2df 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -91,7 +91,8 @@ static struct linux_binfmt elf_fdpic_format = {  static int __init init_elf_fdpic_binfmt(void)  { -	return register_binfmt(&elf_fdpic_format); +	register_binfmt(&elf_fdpic_format); +	return 0;  }  static void __exit exit_elf_fdpic_binfmt(void) @@ -334,8 +335,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,  	current->mm->context.exec_fdpic_loadmap = 0;  	current->mm->context.interp_fdpic_loadmap = 0; -	current->flags &= ~PF_FORKNOEXEC; -  #ifdef CONFIG_MMU  	elf_fdpic_arch_lay_out_mm(&exec_params,  				  &interp_params, @@ -413,7 +412,6 @@ static int load_elf_fdpic_binary(struct linux_binprm 
*bprm,  #endif  	install_exec_creds(bprm); -	current->flags &= ~PF_FORKNOEXEC;  	if (create_elf_fdpic_tables(bprm, current->mm,  				    &exec_params, &interp_params) < 0)  		goto error_kill; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index b8e8b0acf9b..2790c7e1912 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -100,7 +100,8 @@ static struct linux_binfmt em86_format = {  static int __init init_em86_binfmt(void)  { -	return register_binfmt(&em86_format); +	register_binfmt(&em86_format); +	return 0;  }  static void __exit exit_em86_binfmt(void) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 1bffbe0ed77..5979027451b 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -15,7 +15,7 @@   *	JAN/99 -- coded full program relocation (gerg@snapgear.com)   */ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/kernel.h>  #include <linux/sched.h>  #include <linux/mm.h> @@ -902,7 +902,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)  						libinfo.lib_list[j].start_data:UNLOADED_LIB;  	install_exec_creds(bprm); - 	current->flags &= ~PF_FORKNOEXEC;  	set_binfmt(&flat_format); @@ -950,7 +949,8 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)  static int __init init_flat_binfmt(void)  { -	return register_binfmt(&flat_format); +	register_binfmt(&flat_format); +	return 0;  }  /****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a9198dfd5f8..613aa061823 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -19,6 +19,7 @@  #include <linux/module.h>  #include <linux/init.h>  #include <linux/sched.h> +#include <linux/magic.h>  #include <linux/binfmts.h>  #include <linux/slab.h>  #include <linux/ctype.h> @@ -699,7 +700,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)  		[3] = {"register", &bm_register_operations, S_IWUSR},  		/* last one */ {""}  	}; -	int err = 
simple_fill_super(sb, 0x42494e4d, bm_files); +	int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);  	if (!err)  		sb->s_op = &s_ops;  	return err; @@ -726,11 +727,8 @@ static struct file_system_type bm_fs_type = {  static int __init init_misc_binfmt(void)  {  	int err = register_filesystem(&bm_fs_type); -	if (!err) { -		err = insert_binfmt(&misc_format); -		if (err) -			unregister_filesystem(&bm_fs_type); -	} +	if (!err) +		insert_binfmt(&misc_format);  	return err;  } diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 396a9884591..d3b8c1f6315 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -105,7 +105,8 @@ static struct linux_binfmt script_format = {  static int __init init_script_binfmt(void)  { -	return register_binfmt(&script_format); +	register_binfmt(&script_format); +	return 0;  }  static void __exit exit_script_binfmt(void) diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index cc8560f6c9b..e4fc746629a 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -225,7 +225,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)  		goto out_free;  	/* OK, This is the point of no return */ -	current->flags &= ~PF_FORKNOEXEC;  	current->personality = PER_HPUX;  	setup_new_exec(bprm); @@ -289,7 +288,8 @@ static int load_som_library(struct file *f)  static int __init init_som_binfmt(void)  { -	return register_binfmt(&som_format); +	register_binfmt(&som_format); +	return 0;  }  static void __exit exit_som_binfmt(void) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index c2183f3917c..e85c04b9f61 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -357,7 +357,7 @@ static void bio_integrity_generate(struct bio *bio)  	bix.sector_size = bi->sector_size;  	bio_for_each_segment(bv, bio, i) { -		void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); +		void *kaddr = kmap_atomic(bv->bv_page);  		bix.data_buf = kaddr + bv->bv_offset;  		bix.data_size = bv->bv_len;  		bix.prot_buf = prot_buf; @@ -371,7 +371,7 @@ static void 
bio_integrity_generate(struct bio *bio)  		total += sectors * bi->tuple_size;  		BUG_ON(total > bio->bi_integrity->bip_size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  	}  } @@ -498,7 +498,7 @@ static int bio_integrity_verify(struct bio *bio)  	bix.sector_size = bi->sector_size;  	bio_for_each_segment(bv, bio, i) { -		void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); +		void *kaddr = kmap_atomic(bv->bv_page);  		bix.data_buf = kaddr + bv->bv_offset;  		bix.data_size = bv->bv_len;  		bix.prot_buf = prot_buf; @@ -507,7 +507,7 @@ static int bio_integrity_verify(struct bio *bio)  		ret = bi->verify_fn(&bix);  		if (ret) { -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			return ret;  		} @@ -517,7 +517,7 @@ static int bio_integrity_verify(struct bio *bio)  		total += sectors * bi->tuple_size;  		BUG_ON(total > bio->bi_integrity->bip_size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  	}  	return ret; @@ -22,7 +22,7 @@  #include <linux/slab.h>  #include <linux/init.h>  #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/mempool.h>  #include <linux/workqueue.h>  #include <scsi/sg.h>		/* for struct sg_iovec */ @@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone);  int bio_get_nr_vecs(struct block_device *bdev)  {  	struct request_queue *q = bdev_get_queue(bdev); -	int nr_pages; - -	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; -	if (nr_pages > queue_max_segments(q)) -		nr_pages = queue_max_segments(q); - -	return nr_pages; +	return min_t(unsigned, +		     queue_max_segments(q), +		     queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);  }  EXPORT_SYMBOL(bio_get_nr_vecs); diff --git a/fs/block_dev.c b/fs/block_dev.c index 0e575d1304b..e08f6a20a5b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -16,6 +16,7 @@  #include <linux/blkdev.h>  #include <linux/module.h>  #include <linux/blkpg.h> +#include <linux/magic.h>  #include <linux/buffer_head.h>  #include 
<linux/swap.h>  #include <linux/pagevec.h> @@ -109,7 +110,7 @@ void invalidate_bdev(struct block_device *bdev)  	/* 99% of the time, we don't need to flush the cleancache on the bdev.  	 * But, for the strange corners, lets be cautious  	 */ -	cleancache_flush_inode(mapping); +	cleancache_invalidate_inode(mapping);  }  EXPORT_SYMBOL(invalidate_bdev); @@ -506,7 +507,7 @@ static const struct super_operations bdev_sops = {  static struct dentry *bd_mount(struct file_system_type *fs_type,  	int flags, const char *dev_name, void *data)  { -	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, 0x62646576); +	return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);  }  static struct file_system_type bd_type = { @@ -1183,8 +1184,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  			 * The latter is necessary to prevent ghost  			 * partitions on a removed medium.  			 */ -			if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) -				rescan_partitions(disk, bdev); +			if (bdev->bd_invalidated) { +				if (!ret) +					rescan_partitions(disk, bdev); +				else if (ret == -ENOMEDIUM) +					invalidate_partitions(disk, bdev); +			}  			if (ret)  				goto out_clear;  		} else { @@ -1214,8 +1219,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)  			if (bdev->bd_disk->fops->open)  				ret = bdev->bd_disk->fops->open(bdev, mode);  			/* the same as first opener case, read comment there */ -			if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM)) -				rescan_partitions(bdev->bd_disk, bdev); +			if (bdev->bd_invalidated) { +				if (!ret) +					rescan_partitions(bdev->bd_disk, bdev); +				else if (ret == -ENOMEDIUM) +					invalidate_partitions(bdev->bd_disk, bdev); +			}  			if (ret)  				goto out_unlock_bdev;  		} diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b9a843226de..0436c12da8c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -297,7 +297,7 @@ static int 
__add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,  	struct btrfs_delayed_extent_op *extent_op = head->extent_op;  	struct rb_node *n = &head->node.rb_node;  	int sgn; -	int ret; +	int ret = 0;  	if (extent_op && extent_op->update_key)  		btrfs_disk_key_to_cpu(info_key, &extent_op->key); @@ -392,7 +392,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,  			     struct btrfs_key *info_key, int *info_level,  			     struct list_head *prefs)  { -	int ret; +	int ret = 0;  	int slot;  	struct extent_buffer *leaf;  	struct btrfs_key key; @@ -583,7 +583,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,  	struct btrfs_path *path;  	struct btrfs_key info_key = { 0 };  	struct btrfs_delayed_ref_root *delayed_refs = NULL; -	struct btrfs_delayed_ref_head *head = NULL; +	struct btrfs_delayed_ref_head *head;  	int info_level = 0;  	int ret;  	struct list_head prefs_delayed; @@ -607,6 +607,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,  	 * at a specified point in time  	 */  again: +	head = NULL; +  	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);  	if (ret < 0)  		goto out; @@ -635,8 +637,10 @@ again:  			goto again;  		}  		ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); -		if (ret) +		if (ret) { +			spin_unlock(&delayed_refs->lock);  			goto out; +		}  	}  	spin_unlock(&delayed_refs->lock); @@ -892,6 +896,8 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,  		if (eb != eb_in)  			free_extent_buffer(eb);  		ret = inode_ref_info(parent, 0, fs_root, path, &found_key); +		if (ret > 0) +			ret = -ENOENT;  		if (ret)  			break;  		next_inum = found_key.offset; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ad0b3ba735b..c053e90f200 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -89,7 +89,6 @@  #include "disk-io.h"  #include "transaction.h"  #include "extent_io.h" -#include "disk-io.h"  
#include "volumes.h"  #include "print-tree.h"  #include "locking.h" @@ -644,7 +643,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(  static int btrfsic_process_superblock(struct btrfsic_state *state,  				      struct btrfs_fs_devices *fs_devices)  { -	int ret; +	int ret = 0;  	struct btrfs_super_block *selected_super;  	struct list_head *dev_head = &fs_devices->devices;  	struct btrfs_device *device; @@ -1662,7 +1661,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,  	block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,  					       &state->block_hashtable);  	if (NULL != block) { -		u64 bytenr; +		u64 bytenr = 0;  		struct list_head *elem_ref_to;  		struct list_head *tmp_ref_to; @@ -2777,9 +2776,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)  			printk(KERN_INFO  			       "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"  			       " size=%lu, data=%p, bdev=%p)\n", -			       rw, bh->b_blocknr, -			       (unsigned long long)dev_bytenr, bh->b_size, -			       bh->b_data, bh->b_bdev); +			       rw, (unsigned long)bh->b_blocknr, +			       (unsigned long long)dev_bytenr, +			       (unsigned long)bh->b_size, bh->b_data, +			       bh->b_bdev);  		btrfsic_process_written_block(dev_state, dev_bytenr,  					      bh->b_data, bh->b_size, NULL,  					      NULL, bh, rw); @@ -2844,7 +2844,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)  			printk(KERN_INFO  			       "submit_bio(rw=0x%x, bi_vcnt=%u,"  			       " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", -			       rw, bio->bi_vcnt, bio->bi_sector, +			       rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,  			       (unsigned long long)dev_bytenr,  			       bio->bi_bdev); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f1c5a0b2d..b805afb37fa 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -120,10 +120,10 @@ static int check_compressed_csum(struct inode *inode,  		page = cb->compressed_pages[i]; 
 		csum = ~(u32)0; -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);  		btrfs_csum_final(csum, (char *)&csum); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		if (csum != *cb_sum) {  			printk(KERN_INFO "btrfs csum failed ino %llu " @@ -521,10 +521,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,  			if (zero_offset) {  				int zeros;  				zeros = PAGE_CACHE_SIZE - zero_offset; -				userpage = kmap_atomic(page, KM_USER0); +				userpage = kmap_atomic(page);  				memset(userpage + zero_offset, 0, zeros);  				flush_dcache_page(page); -				kunmap_atomic(userpage, KM_USER0); +				kunmap_atomic(userpage);  			}  		} @@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,  				   page_offset(bio->bi_io_vec->bv_page),  				   PAGE_CACHE_SIZE);  	read_unlock(&em_tree->lock); +	if (!em) +		return -EIO;  	compressed_len = em->block_len;  	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); @@ -991,9 +993,9 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,  		bytes = min(PAGE_CACHE_SIZE - *pg_offset,  			    PAGE_CACHE_SIZE - buf_offset);  		bytes = min(bytes, working_bytes); -		kaddr = kmap_atomic(page_out, KM_USER0); +		kaddr = kmap_atomic(page_out);  		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		flush_dcache_page(page_out);  		*pg_offset += bytes; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 27ebe61d3cc..80b6486fd5e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -886,7 +886,7 @@ struct btrfs_block_rsv {  	u64 reserved;  	struct btrfs_space_info *space_info;  	spinlock_t lock; -	unsigned int full:1; +	unsigned int full;  };  /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7aa9cd36bf1..534266fe505 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -962,6 +962,13 @@ static int 
btree_releasepage(struct page *page, gfp_t gfp_flags)  	tree = &BTRFS_I(page->mapping->host)->io_tree;  	map = &BTRFS_I(page->mapping->host)->extent_tree; +	/* +	 * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing +	 * slab allocation from alloc_extent_state down the callchain where +	 * it'd hit a BUG_ON as those flags are not allowed. +	 */ +	gfp_flags &= ~GFP_SLAB_BUG_MASK; +  	ret = try_release_extent_state(map, tree, page, gfp_flags);  	if (!ret)  		return 0; @@ -2253,6 +2260,12 @@ int open_ctree(struct super_block *sb,  		goto fail_sb_buffer;  	} +	if (sectorsize < PAGE_SIZE) { +		printk(KERN_WARNING "btrfs: Incompatible sector size " +		       "found on %s\n", sb->s_id); +		goto fail_sb_buffer; +	} +  	mutex_lock(&fs_info->chunk_mutex);  	ret = btrfs_read_sys_array(tree_root);  	mutex_unlock(&fs_info->chunk_mutex); @@ -2294,6 +2307,12 @@ int open_ctree(struct super_block *sb,  	btrfs_close_extra_devices(fs_devices); +	if (!fs_devices->latest_bdev) { +		printk(KERN_CRIT "btrfs: failed to read devices on %s\n", +		       sb->s_id); +		goto fail_tree_roots; +	} +  retry_root_backup:  	blocksize = btrfs_level_size(tree_root,  				     btrfs_super_root_level(disk_super)); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 700879ed64c..37e0a800d34 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,23 +34,24 @@  #include "locking.h"  #include "free-space-cache.h" -/* control flags for do_chunk_alloc's force field +/* + * control flags for do_chunk_alloc's force field   * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk   * if we really need one.   * - * CHUNK_ALLOC_FORCE means it must try to allocate one - *   * CHUNK_ALLOC_LIMITED means to only try and allocate one   * if we have very few chunks already allocated.  
This is   * used as part of the clustering code to help make sure   * we have a good pool of storage to cluster in, without   * filling the FS with empty chunks   * + * CHUNK_ALLOC_FORCE means it must try to allocate one + *   */  enum {  	CHUNK_ALLOC_NO_FORCE = 0, -	CHUNK_ALLOC_FORCE = 1, -	CHUNK_ALLOC_LIMITED = 2, +	CHUNK_ALLOC_LIMITED = 1, +	CHUNK_ALLOC_FORCE = 2,  };  /* @@ -3311,7 +3312,8 @@ commit_trans:  	}  	data_sinfo->bytes_may_use += bytes;  	trace_btrfs_space_reservation(root->fs_info, "space_info", -				      (u64)data_sinfo, bytes, 1); +				      (u64)(unsigned long)data_sinfo, +				      bytes, 1);  	spin_unlock(&data_sinfo->lock);  	return 0; @@ -3332,7 +3334,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)  	spin_lock(&data_sinfo->lock);  	data_sinfo->bytes_may_use -= bytes;  	trace_btrfs_space_reservation(root->fs_info, "space_info", -				      (u64)data_sinfo, bytes, 0); +				      (u64)(unsigned long)data_sinfo, +				      bytes, 0);  	spin_unlock(&data_sinfo->lock);  } @@ -3414,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,  again:  	spin_lock(&space_info->lock); -	if (space_info->force_alloc) +	if (force < space_info->force_alloc)  		force = space_info->force_alloc;  	if (space_info->full) {  		spin_unlock(&space_info->lock); @@ -3610,12 +3613,15 @@ static int may_commit_transaction(struct btrfs_root *root,  	if (space_info != delayed_rsv->space_info)  		return -ENOSPC; +	spin_lock(&space_info->lock);  	spin_lock(&delayed_rsv->lock); -	if (delayed_rsv->size < bytes) { +	if (space_info->bytes_pinned + delayed_rsv->size < bytes) {  		spin_unlock(&delayed_rsv->lock); +		spin_unlock(&space_info->lock);  		return -ENOSPC;  	}  	spin_unlock(&delayed_rsv->lock); +	spin_unlock(&space_info->lock);  commit:  	trans = btrfs_join_transaction(root); @@ -3694,9 +3700,9 @@ again:  		if (used + orig_bytes <= space_info->total_bytes) {  			space_info->bytes_may_use += orig_bytes;  			
trace_btrfs_space_reservation(root->fs_info, -						      "space_info", -						      (u64)space_info, -						      orig_bytes, 1); +					      "space_info", +					      (u64)(unsigned long)space_info, +					      orig_bytes, 1);  			ret = 0;  		} else {  			/* @@ -3765,9 +3771,9 @@ again:  		if (used + num_bytes < space_info->total_bytes + avail) {  			space_info->bytes_may_use += orig_bytes;  			trace_btrfs_space_reservation(root->fs_info, -						      "space_info", -						      (u64)space_info, -						      orig_bytes, 1); +					      "space_info", +					      (u64)(unsigned long)space_info, +					      orig_bytes, 1);  			ret = 0;  		} else {  			wait_ordered = true; @@ -3912,8 +3918,8 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,  			spin_lock(&space_info->lock);  			space_info->bytes_may_use -= num_bytes;  			trace_btrfs_space_reservation(fs_info, "space_info", -						      (u64)space_info, -						      num_bytes, 0); +					      (u64)(unsigned long)space_info, +					      num_bytes, 0);  			space_info->reservation_progress++;  			spin_unlock(&space_info->lock);  		} @@ -4104,7 +4110,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)  	num_bytes += div64_u64(data_used + meta_used, 50);  	if (num_bytes * 3 > meta_used) -		num_bytes = div64_u64(meta_used, 3); +		num_bytes = div64_u64(meta_used, 3) * 2;  	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);  } @@ -4131,14 +4137,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)  		block_rsv->reserved += num_bytes;  		sinfo->bytes_may_use += num_bytes;  		trace_btrfs_space_reservation(fs_info, "space_info", -					      (u64)sinfo, num_bytes, 1); +				      (u64)(unsigned long)sinfo, num_bytes, 1);  	}  	if (block_rsv->reserved >= block_rsv->size) {  		num_bytes = block_rsv->reserved - block_rsv->size;  		sinfo->bytes_may_use -= num_bytes;  		trace_btrfs_space_reservation(fs_info, "space_info", -					      (u64)sinfo, num_bytes, 0); +				 
     (u64)(unsigned long)sinfo, num_bytes, 0);  		sinfo->reservation_progress++;  		block_rsv->reserved = block_rsv->size;  		block_rsv->full = 1; @@ -4191,7 +4197,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,  	if (!trans->bytes_reserved)  		return; -	trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, +	trace_btrfs_space_reservation(root->fs_info, "transaction", +				      (u64)(unsigned long)trans,  				      trans->bytes_reserved, 0);  	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);  	trans->bytes_reserved = 0; @@ -4709,9 +4716,9 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,  			space_info->bytes_reserved += num_bytes;  			if (reserve == RESERVE_ALLOC) {  				trace_btrfs_space_reservation(cache->fs_info, -							      "space_info", -							      (u64)space_info, -							      num_bytes, 0); +					      "space_info", +					      (u64)(unsigned long)space_info, +					      num_bytes, 0);  				space_info->bytes_may_use -= num_bytes;  			}  		} @@ -5794,6 +5801,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,  			 u64 search_end, struct btrfs_key *ins,  			 u64 data)  { +	bool final_tried = false;  	int ret;  	u64 search_start = 0; @@ -5813,22 +5821,25 @@ again:  			       search_start, search_end, hint_byte,  			       ins, data); -	if (ret == -ENOSPC && num_bytes > min_alloc_size) { -		num_bytes = num_bytes >> 1; -		num_bytes = num_bytes & ~(root->sectorsize - 1); -		num_bytes = max(num_bytes, min_alloc_size); -		do_chunk_alloc(trans, root->fs_info->extent_root, -			       num_bytes, data, CHUNK_ALLOC_FORCE); -		goto again; -	} -	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { -		struct btrfs_space_info *sinfo; +	if (ret == -ENOSPC) { +		if (!final_tried) { +			num_bytes = num_bytes >> 1; +			num_bytes = num_bytes & ~(root->sectorsize - 1); +			num_bytes = max(num_bytes, min_alloc_size); +			do_chunk_alloc(trans, 
root->fs_info->extent_root, +				       num_bytes, data, CHUNK_ALLOC_FORCE); +			if (num_bytes == min_alloc_size) +				final_tried = true; +			goto again; +		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { +			struct btrfs_space_info *sinfo; -		sinfo = __find_space_info(root->fs_info, data); -		printk(KERN_ERR "btrfs allocation failed flags %llu, " -		       "wanted %llu\n", (unsigned long long)data, -		       (unsigned long long)num_bytes); -		dump_space_info(sinfo, num_bytes, 1); +			sinfo = __find_space_info(root->fs_info, data); +			printk(KERN_ERR "btrfs allocation failed flags %llu, " +			       "wanted %llu\n", (unsigned long long)data, +			       (unsigned long long)num_bytes); +			dump_space_info(sinfo, num_bytes, 1); +		}  	}  	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); @@ -7881,9 +7892,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)  	u64 start;  	u64 end;  	u64 trimmed = 0; +	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);  	int ret = 0; -	cache = btrfs_lookup_block_group(fs_info, range->start); +	/* +	 * try to trim all FS space, our block group may start from non-zero. 
+	 */ +	if (range->len == total_bytes) +		cache = btrfs_lookup_first_block_group(fs_info, range->start); +	else +		cache = btrfs_lookup_block_group(fs_info, range->start);  	while (cache) {  		if (cache->key.objectid >= (range->start + range->len)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9d09a4f8187..2862454bcdb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -513,6 +513,15 @@ hit_next:  	WARN_ON(state->end < start);  	last_end = state->end; +	if (state->end < end && !need_resched()) +		next_node = rb_next(&state->rb_node); +	else +		next_node = NULL; + +	/* the state doesn't have the wanted bits, go ahead */ +	if (!(state->state & bits)) +		goto next; +  	/*  	 *     | ---- desired range ---- |  	 *  | state | or @@ -565,20 +574,15 @@ hit_next:  		goto out;  	} -	if (state->end < end && prealloc && !need_resched()) -		next_node = rb_next(&state->rb_node); -	else -		next_node = NULL; -  	set |= clear_state_bit(tree, state, &bits, wake); +next:  	if (last_end == (u64)-1)  		goto out;  	start = last_end + 1;  	if (start <= end && next_node) {  		state = rb_entry(next_node, struct extent_state,  				 rb_node); -		if (state->start == start) -			goto hit_next; +		goto hit_next;  	}  	goto search_again; @@ -961,8 +965,6 @@ hit_next:  		set_state_bits(tree, state, &bits);  		clear_state_bit(tree, state, &clear_bits, 0); - -		merge_state(tree, state);  		if (last_end == (u64)-1)  			goto out; @@ -1007,7 +1009,6 @@ hit_next:  		if (state->end <= end) {  			set_state_bits(tree, state, &bits);  			clear_state_bit(tree, state, &clear_bits, 0); -			merge_state(tree, state);  			if (last_end == (u64)-1)  				goto out;  			start = last_end + 1; @@ -1068,8 +1069,6 @@ hit_next:  		set_state_bits(tree, prealloc, &bits);  		clear_state_bit(tree, prealloc, &clear_bits, 0); - -		merge_state(tree, prealloc);  		prealloc = NULL;  		goto out;  	} @@ -2154,13 +2153,46 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,  		 
"this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,  		 failrec->this_mirror, num_copies, failrec->in_validation); -	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, -					failrec->bio_flags, 0); -	return 0; +	ret = tree->ops->submit_bio_hook(inode, read_mode, bio, +					 failrec->this_mirror, +					 failrec->bio_flags, 0); +	return ret;  }  /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ +	int uptodate = (err == 0); +	struct extent_io_tree *tree; +	int ret; + +	tree = &BTRFS_I(page->mapping->host)->io_tree; + +	if (tree->ops && tree->ops->writepage_end_io_hook) { +		ret = tree->ops->writepage_end_io_hook(page, start, +					       end, NULL, uptodate); +		if (ret) +			uptodate = 0; +	} + +	if (!uptodate && tree->ops && +	    tree->ops->writepage_io_failed_hook) { +		ret = tree->ops->writepage_io_failed_hook(NULL, page, +						 start, end, NULL); +		/* Writeback already completed */ +		if (ret == 0) +			return 1; +	} + +	if (!uptodate) { +		clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); +		ClearPageUptodate(page); +		SetPageError(page); +	} +	return 0; +} +  /*   * after a writepage IO is done, we need to:   * clear the uptodate bits on error @@ -2172,13 +2204,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,   */  static void end_bio_extent_writepage(struct bio *bio, int err)  { -	int uptodate = err == 0;  	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;  	struct extent_io_tree *tree;  	u64 start;  	u64 end;  	int whole_page; -	int ret;  	do {  		struct page *page = bvec->bv_page; @@ -2195,28 +2225,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err)  		if (--bvec >= bio->bi_io_vec)  			prefetchw(&bvec->bv_page->flags); -		if (tree->ops && tree->ops->writepage_end_io_hook) { -			ret = tree->ops->writepage_end_io_hook(page, start, -						       end, NULL, uptodate); -			
if (ret) -				uptodate = 0; -		} - -		if (!uptodate && tree->ops && -		    tree->ops->writepage_io_failed_hook) { -			ret = tree->ops->writepage_io_failed_hook(bio, page, -							 start, end, NULL); -			if (ret == 0) { -				uptodate = (err == 0); -				continue; -			} -		} -		if (!uptodate) { -			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); -			ClearPageUptodate(page); -			SetPageError(page); -		} +		if (end_extent_writepage(page, err, start, end)) +			continue;  		if (whole_page)  			end_page_writeback(page); @@ -2535,10 +2546,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  		if (zero_offset) {  			iosize = PAGE_CACHE_SIZE - zero_offset; -			userpage = kmap_atomic(page, KM_USER0); +			userpage = kmap_atomic(page);  			memset(userpage + zero_offset, 0, iosize);  			flush_dcache_page(page); -			kunmap_atomic(userpage, KM_USER0); +			kunmap_atomic(userpage);  		}  	}  	while (cur <= end) { @@ -2547,10 +2558,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  			struct extent_state *cached = NULL;  			iosize = PAGE_CACHE_SIZE - pg_offset; -			userpage = kmap_atomic(page, KM_USER0); +			userpage = kmap_atomic(page);  			memset(userpage + pg_offset, 0, iosize);  			flush_dcache_page(page); -			kunmap_atomic(userpage, KM_USER0); +			kunmap_atomic(userpage);  			set_extent_uptodate(tree, cur, cur + iosize - 1,  					    &cached, GFP_NOFS);  			unlock_extent_cached(tree, cur, cur + iosize - 1, @@ -2596,10 +2607,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,  			char *userpage;  			struct extent_state *cached = NULL; -			userpage = kmap_atomic(page, KM_USER0); +			userpage = kmap_atomic(page);  			memset(userpage + pg_offset, 0, iosize);  			flush_dcache_page(page); -			kunmap_atomic(userpage, KM_USER0); +			kunmap_atomic(userpage);  			set_extent_uptodate(tree, cur, cur + iosize - 1,  					    &cached, GFP_NOFS); @@ -2745,10 +2756,10 @@ static int __extent_writepage(struct page *page, struct 
writeback_control *wbc,  	if (page->index == end_index) {  		char *userpage; -		userpage = kmap_atomic(page, KM_USER0); +		userpage = kmap_atomic(page);  		memset(userpage + pg_offset, 0,  		       PAGE_CACHE_SIZE - pg_offset); -		kunmap_atomic(userpage, KM_USER0); +		kunmap_atomic(userpage);  		flush_dcache_page(page);  	}  	pg_offset = 0; @@ -2779,9 +2790,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  				delalloc_start = delalloc_end + 1;  				continue;  			} -			tree->ops->fill_delalloc(inode, page, delalloc_start, -						 delalloc_end, &page_started, -						 &nr_written); +			ret = tree->ops->fill_delalloc(inode, page, +						       delalloc_start, +						       delalloc_end, +						       &page_started, +						       &nr_written); +			BUG_ON(ret);  			/*  			 * delalloc_end is already one less than the total  			 * length, so we don't subtract one from @@ -2818,8 +2832,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,  	if (tree->ops && tree->ops->writepage_start_hook) {  		ret = tree->ops->writepage_start_hook(page, start,  						      page_end); -		if (ret == -EAGAIN) { -			redirty_page_for_writepage(wbc, page); +		if (ret) { +			/* Fixup worker will requeue */ +			if (ret == -EBUSY) +				wbc->pages_skipped++; +			else +				redirty_page_for_writepage(wbc, page);  			update_nr_written(page, wbc, nr_written);  			unlock_page(page);  			ret = 0; @@ -3289,7 +3307,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,  			len = end - start + 1;  			write_lock(&map->lock);  			em = lookup_extent_mapping(map, start, len); -			if (IS_ERR_OR_NULL(em)) { +			if (!em) {  				write_unlock(&map->lock);  				break;  			} @@ -3853,10 +3871,9 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,  	num_pages = num_extent_pages(eb->start, eb->len);  	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -	if (eb_straddles_pages(eb)) { -		clear_extent_uptodate(tree, eb->start, eb->start 
+ eb->len - 1, -				      cached_state, GFP_NOFS); -	} +	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, +			      cached_state, GFP_NOFS); +  	for (i = 0; i < num_pages; i++) {  		page = extent_buffer_page(eb, i);  		if (page) @@ -3909,6 +3926,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,  	while (start <= end) {  		index = start >> PAGE_CACHE_SHIFT;  		page = find_get_page(tree->mapping, index); +		if (!page) +			return 1;  		uptodate = PageUptodate(page);  		page_cache_release(page);  		if (!uptodate) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc6a042cb6f..cecc3518c12 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -319,4 +319,5 @@ struct btrfs_mapping_tree;  int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,  			u64 length, u64 logical, struct page *page,  			int mirror_num); +int end_extent_writepage(struct page *page, int err, u64 start, u64 end);  #endif diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 33a7890b1f4..1195f09761f 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,8 +26,8 @@ struct extent_map {  	unsigned long flags;  	struct block_device *bdev;  	atomic_t refs; -	unsigned int in_tree:1; -	unsigned int compress_type:4; +	unsigned int in_tree; +	unsigned int compress_type;  };  struct extent_map_tree { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c7fb3a4247d..078b4fd5450 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -447,13 +447,13 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,  			sums->bytenr = ordered->start;  		} -		data = kmap_atomic(bvec->bv_page, KM_USER0); +		data = kmap_atomic(bvec->bv_page);  		sector_sum->sum = ~(u32)0;  		sector_sum->sum = btrfs_csum_data(root,  						  data + bvec->bv_offset,  						  sector_sum->sum,  						  bvec->bv_len); -		kunmap_atomic(data, KM_USER0); +		kunmap_atomic(data);  		btrfs_csum_final(sector_sum->sum,  				 (char 
*)&sector_sum->sum);  		sector_sum->bytenr = disk_bytenr; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 859ba2dd889..e8d06b6b919 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1605,6 +1605,14 @@ static long btrfs_fallocate(struct file *file, int mode,  		return -EOPNOTSUPP;  	/* +	 * Make sure we have enough space before we do the +	 * allocation. +	 */ +	ret = btrfs_check_data_free_space(inode, len); +	if (ret) +		return ret; + +	/*  	 * wait for ordered IO before we have any locks.  We'll loop again  	 * below with the locks held.  	 */ @@ -1667,27 +1675,12 @@ static long btrfs_fallocate(struct file *file, int mode,  		if (em->block_start == EXTENT_MAP_HOLE ||  		    (cur_offset >= inode->i_size &&  		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - -			/* -			 * Make sure we have enough space before we do the -			 * allocation. -			 */ -			ret = btrfs_check_data_free_space(inode, last_byte - -							  cur_offset); -			if (ret) { -				free_extent_map(em); -				break; -			} -  			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,  							last_byte - cur_offset,  							1 << inode->i_blkbits,  							offset + len,  							&alloc_hint); -			/* Let go of our reservation. */ -			btrfs_free_reserved_data_space(inode, last_byte - -						       cur_offset);  			if (ret < 0) {  				free_extent_map(em);  				break; @@ -1715,6 +1708,8 @@ static long btrfs_fallocate(struct file *file, int mode,  			     &cached_state, GFP_NOFS);  out:  	mutex_unlock(&inode->i_mutex); +	/* Let go of our reservation. 
*/ +	btrfs_free_reserved_data_space(inode, len);  	return ret;  } @@ -1761,7 +1756,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)  						     start - root->sectorsize,  						     root->sectorsize, 0);  		if (IS_ERR(em)) { -			ret = -ENXIO; +			ret = PTR_ERR(em);  			goto out;  		}  		last_end = em->start + em->len; @@ -1773,7 +1768,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)  	while (1) {  		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);  		if (IS_ERR(em)) { -			ret = -ENXIO; +			ret = PTR_ERR(em);  			break;  		} diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d20ff87ca60..b02e379b14c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -777,6 +777,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,  	spin_lock(&block_group->lock);  	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {  		spin_unlock(&block_group->lock); +		btrfs_free_path(path);  		goto out;  	}  	spin_unlock(&block_group->lock); @@ -1067,7 +1068,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,  		spin_unlock(&block_group->lock);  		ret = 0;  #ifdef DEBUG -		printk(KERN_ERR "btrfs: failed to write free space cace " +		printk(KERN_ERR "btrfs: failed to write free space cache "  		       "for block group %llu\n", block_group->key.objectid);  #endif  	} @@ -2242,7 +2243,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,  		if (entry->bitmap) {  			ret = btrfs_alloc_from_bitmap(block_group,  						      cluster, entry, bytes, -						      min_start); +						      cluster->window_start);  			if (ret == 0) {  				node = rb_next(&entry->offset_index);  				if (!node) @@ -2251,6 +2252,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,  						 offset_index);  				continue;  			} +			cluster->window_start += bytes;  		} else {  			ret = entry->offset; @@ -2475,7 +2477,7 @@ 
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,  	}  	list_for_each_entry(entry, bitmaps, list) { -		if (entry->bytes < min_bytes) +		if (entry->bytes < bytes)  			continue;  		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,  					   bytes, cont1_bytes, min_bytes); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 213ffa86ce1..ee15d88b33d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,7 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,  					  trans->bytes_reserved);  	if (ret)  		goto out; -	trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, +	trace_btrfs_space_reservation(root->fs_info, "ino_cache", +				      (u64)(unsigned long)trans,  				      trans->bytes_reserved, 1);  again:  	inode = lookup_free_ino_inode(root, path); @@ -500,7 +501,8 @@ again:  out_put:  	iput(inode);  out_release: -	trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, +	trace_btrfs_space_reservation(root->fs_info, "ino_cache", +				      (u64)(unsigned long)trans,  				      trans->bytes_reserved, 0);  	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);  out: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0da19a0ea00..3a0b5c1f9d3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -173,9 +173,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,  			cur_size = min_t(unsigned long, compressed_size,  				       PAGE_CACHE_SIZE); -			kaddr = kmap_atomic(cpage, KM_USER0); +			kaddr = kmap_atomic(cpage);  			write_extent_buffer(leaf, kaddr, ptr, cur_size); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			i++;  			ptr += cur_size; @@ -187,10 +187,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,  		page = find_get_page(inode->i_mapping,  				     start >> PAGE_CACHE_SHIFT);  		btrfs_set_file_extent_compression(leaf, ei, 0); -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = 
kmap_atomic(page);  		offset = start & (PAGE_CACHE_SIZE - 1);  		write_extent_buffer(leaf, kaddr + offset, ptr, size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		page_cache_release(page);  	}  	btrfs_mark_buffer_dirty(leaf); @@ -422,10 +422,10 @@ again:  			 * sending it down to disk  			 */  			if (offset) { -				kaddr = kmap_atomic(page, KM_USER0); +				kaddr = kmap_atomic(page);  				memset(kaddr + offset, 0,  				       PAGE_CACHE_SIZE - offset); -				kunmap_atomic(kaddr, KM_USER0); +				kunmap_atomic(kaddr);  			}  			will_compress = 1;  		} @@ -1555,6 +1555,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)  	struct inode *inode;  	u64 page_start;  	u64 page_end; +	int ret;  	fixup = container_of(work, struct btrfs_writepage_fixup, work);  	page = fixup->page; @@ -1582,12 +1583,21 @@ again:  				     page_end, &cached_state, GFP_NOFS);  		unlock_page(page);  		btrfs_start_ordered_extent(inode, ordered, 1); +		btrfs_put_ordered_extent(ordered);  		goto again;  	} -	BUG(); +	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); +	if (ret) { +		mapping_set_error(page->mapping, ret); +		end_extent_writepage(page, ret, page_start, page_end); +		ClearPageChecked(page); +		goto out; +	 } +  	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);  	ClearPageChecked(page); +	set_page_dirty(page);  out:  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,  			     &cached_state, GFP_NOFS); @@ -1630,7 +1640,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)  	fixup->work.func = btrfs_writepage_fixup_worker;  	fixup->page = page;  	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); -	return -EAGAIN; +	return -EBUSY;  }  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -1863,7 +1873,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  	} else {  		ret = get_state_private(io_tree, start, 
&private);  	} -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	if (ret)  		goto zeroit; @@ -1872,7 +1882,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,  	if (csum != private)  		goto zeroit; -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  good:  	return 0; @@ -1884,7 +1894,7 @@ zeroit:  		       (unsigned long long)private);  	memset(kaddr + offset, 1, end - start + 1);  	flush_dcache_page(page); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (private == 0)  		return 0;  	return -EIO; @@ -4575,7 +4585,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,  		ret = btrfs_insert_dir_item(trans, root, name, name_len,  					    parent_inode, &key,  					    btrfs_inode_type(inode), index); -		BUG_ON(ret); +		if (ret) +			goto fail_dir_item;  		btrfs_i_size_write(parent_inode, parent_inode->i_size +  				   name_len * 2); @@ -4583,6 +4594,23 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,  		ret = btrfs_update_inode(trans, root, parent_inode);  	}  	return ret; + +fail_dir_item: +	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { +		u64 local_index; +		int err; +		err = btrfs_del_root_ref(trans, root->fs_info->tree_root, +				 key.objectid, root->root_key.objectid, +				 parent_ino, &local_index, name, name_len); + +	} else if (add_backref) { +		u64 local_index; +		int err; + +		err = btrfs_del_inode_ref(trans, root, name, name_len, +					  ino, parent_ino, &local_index); +	} +	return ret;  }  static int btrfs_add_nondir(struct btrfs_trans_handle *trans, @@ -4909,12 +4937,12 @@ static noinline int uncompress_inline(struct btrfs_path *path,  	ret = btrfs_decompress(compress_type, tmp, page,  			       extent_offset, inline_size, max_size);  	if (ret) { -		char *kaddr = kmap_atomic(page, KM_USER0); +		char *kaddr = kmap_atomic(page);  		unsigned long copy_size = min_t(u64,  				  PAGE_CACHE_SIZE - pg_offset,  				  max_size - extent_offset);  		memset(kaddr + pg_offset, 0, 
copy_size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  	}  	kfree(tmp);  	return 0; @@ -5691,11 +5719,11 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)  			unsigned long flags;  			local_irq_save(flags); -			kaddr = kmap_atomic(page, KM_IRQ0); +			kaddr = kmap_atomic(page);  			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,  					       csum, bvec->bv_len);  			btrfs_csum_final(csum, (char *)&csum); -			kunmap_atomic(kaddr, KM_IRQ0); +			kunmap_atomic(kaddr);  			local_irq_restore(flags);  			flush_dcache_page(bvec->bv_page); @@ -6401,18 +6429,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	unsigned long zero_start;  	loff_t size;  	int ret; +	int reserved = 0;  	u64 page_start;  	u64 page_end;  	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); -	if (!ret) +	if (!ret) {  		ret = btrfs_update_time(vma->vm_file); +		reserved = 1; +	}  	if (ret) {  		if (ret == -ENOMEM)  			ret = VM_FAULT_OOM;  		else /* -ENOSPC, -EIO, etc */  			ret = VM_FAULT_SIGBUS; -		goto out; +		if (reserved) +			goto out; +		goto out_noreserve;  	}  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ @@ -6495,6 +6528,7 @@ out_unlock:  	unlock_page(page);  out:  	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out_noreserve:  	return ret;  } @@ -6690,8 +6724,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,  	int err;  	u64 index = 0; -	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, -				new_dirid, S_IFDIR | 0700, &index); +	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, +				new_dirid, new_dirid, +				S_IFDIR | (~current_umask() & S_IRWXUGO), +				&index);  	if (IS_ERR(inode))  		return PTR_ERR(inode);  	inode->i_op = &btrfs_dir_inode_operations; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab620014bcc..d8b54715c2d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -861,6 +861,7 @@ static int cluster_pages_for_defrag(struct inode 
*inode,  	int i_done;  	struct btrfs_ordered_extent *ordered;  	struct extent_state *cached_state = NULL; +	struct extent_io_tree *tree;  	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);  	if (isize == 0) @@ -871,18 +872,34 @@ static int cluster_pages_for_defrag(struct inode *inode,  					   num_pages << PAGE_CACHE_SHIFT);  	if (ret)  		return ret; -again: -	ret = 0;  	i_done = 0; +	tree = &BTRFS_I(inode)->io_tree;  	/* step one, lock all the pages */  	for (i = 0; i < num_pages; i++) {  		struct page *page; +again:  		page = find_or_create_page(inode->i_mapping, -					    start_index + i, mask); +					   start_index + i, mask);  		if (!page)  			break; +		page_start = page_offset(page); +		page_end = page_start + PAGE_CACHE_SIZE - 1; +		while (1) { +			lock_extent(tree, page_start, page_end, GFP_NOFS); +			ordered = btrfs_lookup_ordered_extent(inode, +							      page_start); +			unlock_extent(tree, page_start, page_end, GFP_NOFS); +			if (!ordered) +				break; + +			unlock_page(page); +			btrfs_start_ordered_extent(inode, ordered, 1); +			btrfs_put_ordered_extent(ordered); +			lock_page(page); +		} +  		if (!PageUptodate(page)) {  			btrfs_readpage(NULL, page);  			lock_page(page); @@ -893,15 +910,22 @@ again:  				break;  			}  		} +  		isize = i_size_read(inode);  		file_end = (isize - 1) >> PAGE_CACHE_SHIFT; -		if (!isize || page->index > file_end || -		    page->mapping != inode->i_mapping) { +		if (!isize || page->index > file_end) {  			/* whoops, we blew past eof, skip this page */  			unlock_page(page);  			page_cache_release(page);  			break;  		} + +		if (page->mapping != inode->i_mapping) { +			unlock_page(page); +			page_cache_release(page); +			goto again; +		} +  		pages[i] = page;  		i_done++;  	} @@ -924,25 +948,6 @@ again:  	lock_extent_bits(&BTRFS_I(inode)->io_tree,  			 page_start, page_end - 1, 0, &cached_state,  			 GFP_NOFS); -	ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); -	if (ordered && -	    
ordered->file_offset + ordered->len > page_start && -	    ordered->file_offset < page_end) { -		btrfs_put_ordered_extent(ordered); -		unlock_extent_cached(&BTRFS_I(inode)->io_tree, -				     page_start, page_end - 1, -				     &cached_state, GFP_NOFS); -		for (i = 0; i < i_done; i++) { -			unlock_page(pages[i]); -			page_cache_release(pages[i]); -		} -		btrfs_wait_ordered_range(inode, page_start, -					 page_end - page_start); -		goto again; -	} -	if (ordered) -		btrfs_put_ordered_extent(ordered); -  	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,  			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |  			  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, @@ -1065,7 +1070,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  		i = range->start >> PAGE_CACHE_SHIFT;  	}  	if (!max_to_defrag) -		max_to_defrag = last_index; +		max_to_defrag = last_index + 1;  	/*  	 * make writeback starts from i, so the defrag range can be @@ -1327,6 +1332,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,  		goto out;  	} +	if (name[0] == '.' && +	   (namelen == 1 || (name[1] == '.' 
&& namelen == 2))) { +		ret = -EEXIST; +		goto out; +	} +  	if (subvol) {  		ret = btrfs_mksubvol(&file->f_path, name, namelen,  				     NULL, transid, readonly); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a178f5ebea7..743b86fa4fc 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -411,9 +411,9 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,  	bytes = min_t(unsigned long, destlen, out_len - start_byte); -	kaddr = kmap_atomic(dest_page, KM_USER0); +	kaddr = kmap_atomic(dest_page);  	memcpy(kaddr, workspace->buf + start_byte, bytes); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  out:  	return ret;  } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 2373b39a132..22db04550f6 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -305,7 +305,7 @@ again:  	spin_lock(&fs_info->reada_lock);  	ret = radix_tree_insert(&dev->reada_zones, -				(unsigned long)zone->end >> PAGE_CACHE_SHIFT, +				(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),  				zone);  	spin_unlock(&fs_info->reada_lock); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9770cc5bfb7..390e7102b0f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -591,7 +591,7 @@ static int scrub_fixup_check(struct scrub_bio *sbio, int ix)  	u64 flags = sbio->spag[ix].flags;  	page = sbio->bio->bi_io_vec[ix].bv_page; -	buffer = kmap_atomic(page, KM_USER0); +	buffer = kmap_atomic(page);  	if (flags & BTRFS_EXTENT_FLAG_DATA) {  		ret = scrub_checksum_data(sbio->sdev,  					  sbio->spag + ix, buffer); @@ -603,7 +603,7 @@ static int scrub_fixup_check(struct scrub_bio *sbio, int ix)  	} else {  		WARN_ON(1);  	} -	kunmap_atomic(buffer, KM_USER0); +	kunmap_atomic(buffer);  	return ret;  } @@ -792,7 +792,7 @@ static void scrub_checksum(struct btrfs_work *work)  	}  	for (i = 0; i < sbio->count; ++i) {  		page = sbio->bio->bi_io_vec[i].bv_page; -		buffer = kmap_atomic(page, KM_USER0); +		buffer = kmap_atomic(page);  		flags = sbio->spag[i].flags;  		logical = 
sbio->logical + i * PAGE_SIZE;  		ret = 0; @@ -807,7 +807,7 @@ static void scrub_checksum(struct btrfs_work *work)  		} else {  			WARN_ON(1);  		} -		kunmap_atomic(buffer, KM_USER0); +		kunmap_atomic(buffer);  		if (ret) {  			ret = scrub_recheck_error(sbio, i);  			if (!ret) { @@ -1367,7 +1367,8 @@ out:  }  static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, -	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length) +	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, +	u64 dev_offset)  {  	struct btrfs_mapping_tree *map_tree =  		&sdev->dev->dev_root->fs_info->mapping_tree; @@ -1391,7 +1392,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,  		goto out;  	for (i = 0; i < map->num_stripes; ++i) { -		if (map->stripes[i].dev == sdev->dev) { +		if (map->stripes[i].dev == sdev->dev && +		    map->stripes[i].physical == dev_offset) {  			ret = scrub_stripe(sdev, map, i, chunk_offset, length);  			if (ret)  				goto out; @@ -1487,7 +1489,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)  			break;  		}  		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, -				  chunk_offset, length); +				  chunk_offset, length, found_key.offset);  		btrfs_put_block_group(cache);  		if (ret)  			break; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3ce97b217cb..81df3fec6a6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -629,7 +629,6 @@ static int btrfs_fill_super(struct super_block *sb,  			    void *data, int silent)  {  	struct inode *inode; -	struct dentry *root_dentry;  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);  	struct btrfs_key key;  	int err; @@ -660,15 +659,12 @@ static int btrfs_fill_super(struct super_block *sb,  		goto fail_close;  	} -	root_dentry = d_alloc_root(inode); -	if (!root_dentry) { -		iput(inode); +	sb->s_root = d_make_root(inode); +	if (!sb->s_root) {  		err = -ENOMEM;  		goto fail_close;  	} -	sb->s_root = root_dentry; -  	save_mount_options(sb, data);  	
cleancache_init_fs(sb);  	sb->s_flags |= MS_ACTIVE; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 287a6728b1a..04b77e3ceb7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -327,7 +327,8 @@ again:  	if (num_bytes) {  		trace_btrfs_space_reservation(root->fs_info, "transaction", -					      (u64)h, num_bytes, 1); +					      (u64)(unsigned long)h, +					      num_bytes, 1);  		h->block_rsv = &root->fs_info->trans_block_rsv;  		h->bytes_reserved = num_bytes;  	} @@ -915,7 +916,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,  				dentry->d_name.name, dentry->d_name.len,  				parent_inode, &key,  				BTRFS_FT_DIR, index); -	BUG_ON(ret); +	if (ret) { +		pending->error = -EEXIST; +		dput(parent); +		goto fail; +	}  	btrfs_i_size_write(parent_inode, parent_inode->i_size +  					 dentry->d_name.len * 2); @@ -993,12 +998,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,  {  	struct btrfs_pending_snapshot *pending;  	struct list_head *head = &trans->transaction->pending_snapshots; -	int ret; -	list_for_each_entry(pending, head, list) { -		ret = create_pending_snapshot(trans, fs_info, pending); -		BUG_ON(ret); -	} +	list_for_each_entry(pending, head, list) +		create_pending_snapshot(trans, fs_info, pending);  	return 0;  } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cb877e0886a..966cc74f5d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,  		finish_wait(&root->log_commit_wait[index], &wait);  		mutex_lock(&root->log_mutex); -	} while (root->log_transid < transid + 2 && +	} while (root->fs_info->last_trans_log_full_commit != +		 trans->transid && root->log_transid < transid + 2 &&  		 atomic_read(&root->log_commit[index]));  	return 0;  } @@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,  			   struct btrfs_root *root)  {  	
DEFINE_WAIT(wait); -	while (atomic_read(&root->log_writers)) { +	while (root->fs_info->last_trans_log_full_commit != +	       trans->transid && atomic_read(&root->log_writers)) {  		prepare_to_wait(&root->log_writer_wait,  				&wait, TASK_UNINTERRUPTIBLE);  		mutex_unlock(&root->log_mutex); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0b4e2af7954..ef41f285a47 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -459,12 +459,23 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)  {  	struct btrfs_device *device, *next; +	struct block_device *latest_bdev = NULL; +	u64 latest_devid = 0; +	u64 latest_transid = 0; +  	mutex_lock(&uuid_mutex);  again:  	/* This is the initialized path, it is safe to release the devices. */  	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { -		if (device->in_fs_metadata) +		if (device->in_fs_metadata) { +			if (!latest_transid || +			    device->generation > latest_transid) { +				latest_devid = device->devid; +				latest_transid = device->generation; +				latest_bdev = device->bdev; +			}  			continue; +		}  		if (device->bdev) {  			blkdev_put(device->bdev, device->mode); @@ -487,6 +498,10 @@ again:  		goto again;  	} +	fs_devices->latest_bdev = latest_bdev; +	fs_devices->latest_devid = latest_devid; +	fs_devices->latest_trans = latest_transid; +  	mutex_unlock(&uuid_mutex);  	return 0;  } @@ -1953,7 +1968,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,  	em = lookup_extent_mapping(em_tree, chunk_offset, 1);  	read_unlock(&em_tree->lock); -	BUG_ON(em->start > chunk_offset || +	BUG_ON(!em || em->start > chunk_offset ||  	       em->start + em->len < chunk_offset);  	map = (struct map_lookup *)em->bdev; @@ -4356,6 +4371,20 @@ int btrfs_read_sys_array(struct btrfs_root *root)  		return -ENOMEM;  	btrfs_set_buffer_uptodate(sb);  	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); +	/* +	 * The sb extent buffer is artifical and just used to read the 
system array. +	 * btrfs_set_buffer_uptodate() call does not properly mark all it's +	 * pages up-to-date when the page is larger: extent does not cover the +	 * whole page and consequently check_page_uptodate does not find all +	 * the page's extents up-to-date (the hole beyond sb), +	 * write_extent_buffer then triggers a WARN_ON. +	 * +	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, +	 * but sb spans only this function. Add an explicit SetPageUptodate call +	 * to silence the warning eg. on PowerPC 64. +	 */ +	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) +		SetPageUptodate(sb->first_page);  	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);  	array_size = btrfs_super_sys_array_size(super_copy); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index faccd47c6c4..92c20654cc5 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -370,9 +370,9 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,  			    PAGE_CACHE_SIZE - buf_offset);  		bytes = min(bytes, bytes_left); -		kaddr = kmap_atomic(dest_page, KM_USER0); +		kaddr = kmap_atomic(dest_page);  		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		pg_offset += bytes;  		bytes_left -= bytes; diff --git a/fs/buffer.c b/fs/buffer.c index 1a30db77af3..70e2017edd7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -29,7 +29,7 @@  #include <linux/file.h>  #include <linux/quotaops.h>  #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/writeback.h>  #include <linux/hash.h>  #include <linux/suspend.h> diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index a0358c2189c..7f0771d3894 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -646,7 +646,8 @@ lookup_again:  		 * (this is used to keep track of culling, and atimes are only  		 * updated by read, write and readdir but not lookup or  		 * open) */ -		
touch_atime(cache->mnt, next); +		path.dentry = next; +		touch_atime(&path);  	}  	/* open a file interface onto a data file */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b60fc8bfb3e..620daad201d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)  	unsigned long ttl;  	u32 gen; -	spin_lock(&cap->session->s_cap_lock); +	spin_lock(&cap->session->s_gen_ttl_lock);  	gen = cap->session->s_cap_gen;  	ttl = cap->session->s_cap_ttl; -	spin_unlock(&cap->session->s_cap_lock); +	spin_unlock(&cap->session->s_gen_ttl_lock);  	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {  		dout("__cap_is_valid %p cap %p issued %s " diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 618246bc219..3e8094be460 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -975,10 +975,10 @@ static int dentry_lease_is_valid(struct dentry *dentry)  	di = ceph_dentry(dentry);  	if (di->lease_session) {  		s = di->lease_session; -		spin_lock(&s->s_cap_lock); +		spin_lock(&s->s_gen_ttl_lock);  		gen = s->s_cap_gen;  		ttl = s->s_cap_ttl; -		spin_unlock(&s->s_cap_lock); +		spin_unlock(&s->s_gen_ttl_lock);  		if (di->lease_gen == gen &&  		    time_before(jiffies, dentry->d_time) && diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2c489378b4c..9fff9f3b17e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -677,18 +677,19 @@ static int fill_inode(struct inode *inode,  	case S_IFLNK:  		inode->i_op = &ceph_symlink_iops;  		if (!ci->i_symlink) { -			int symlen = iinfo->symlink_len; +			u32 symlen = iinfo->symlink_len;  			char *sym; -			BUG_ON(symlen != inode->i_size);  			spin_unlock(&ci->i_ceph_lock); +			err = -EINVAL; +			if (WARN_ON(symlen != inode->i_size)) +				goto out; +  			err = -ENOMEM; -			sym = kmalloc(symlen+1, GFP_NOFS); +			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);  			if (!sym)  				goto out; -			memcpy(sym, iinfo->symlink, symlen); -			sym[symlen] = 0;  			spin_lock(&ci->i_ceph_lock);  			if 
(!ci->i_symlink) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 23ab6a3f182..89971e137aa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,  	/* trace */  	ceph_decode_32_safe(&p, end, len, bad);  	if (len > 0) { +		ceph_decode_need(&p, end, len, bad);  		err = parse_reply_info_trace(&p, p+len, info, features);  		if (err < 0)  			goto out_bad; @@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,  	/* extra */  	ceph_decode_32_safe(&p, end, len, bad);  	if (len > 0) { +		ceph_decode_need(&p, end, len, bad);  		err = parse_reply_info_extra(&p, p+len, info, features);  		if (err < 0)  			goto out_bad; @@ -398,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,  	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;  	s->s_con.peer_name.num = cpu_to_le64(mds); -	spin_lock_init(&s->s_cap_lock); +	spin_lock_init(&s->s_gen_ttl_lock);  	s->s_cap_gen = 0; -	s->s_cap_ttl = 0; +	s->s_cap_ttl = jiffies - 1; + +	spin_lock_init(&s->s_cap_lock);  	s->s_renew_requested = 0;  	s->s_renew_seq = 0;  	INIT_LIST_HEAD(&s->s_caps); @@ -1079,8 +1083,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,  	int wake = 0;  	spin_lock(&session->s_cap_lock); -	was_stale = is_renew && (session->s_cap_ttl == 0 || -				 time_after_eq(jiffies, session->s_cap_ttl)); +	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);  	session->s_cap_ttl = session->s_renew_requested +  		mdsc->mdsmap->m_session_timeout*HZ; @@ -2326,10 +2329,10 @@ static void handle_session(struct ceph_mds_session *session,  	case CEPH_SESSION_STALE:  		pr_info("mds%d caps went stale, renewing\n",  			session->s_mds); -		spin_lock(&session->s_cap_lock); +		spin_lock(&session->s_gen_ttl_lock);  		session->s_cap_gen++; -		session->s_cap_ttl = 0; -		spin_unlock(&session->s_cap_lock); +		session->s_cap_ttl = jiffies - 1; +		spin_unlock(&session->s_gen_ttl_lock);  		
send_renew_caps(mdsc, session);  		break; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a50ca0e3947..8c7c04ebb59 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -117,10 +117,13 @@ struct ceph_mds_session {  	void             *s_authorizer_buf, *s_authorizer_reply_buf;  	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len; -	/* protected by s_cap_lock */ -	spinlock_t        s_cap_lock; +	/* protected by s_gen_ttl_lock */ +	spinlock_t        s_gen_ttl_lock;  	u32               s_cap_gen;  /* inc each time we get mds stale msg */  	unsigned long     s_cap_ttl;  /* when session caps expire */ + +	/* protected by s_cap_lock */ +	spinlock_t        s_cap_lock;  	struct list_head  s_caps;     /* all caps issued by this session */  	int               s_nr_caps, s_trim_caps;  	int               s_num_cap_releases; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a559c80f127..f04c0961f99 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -331,7 +331,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)  	/* alloc new snap context */  	err = -ENOMEM; -	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) +	if (num > (ULONG_MAX - sizeof(*snapc)) / sizeof(u64))  		goto fail;  	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);  	if (!snapc) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 00de2c9568c..1e67dd7305a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -130,10 +130,12 @@ enum {  	Opt_nodirstat,  	Opt_rbytes,  	Opt_norbytes, +	Opt_asyncreaddir,  	Opt_noasyncreaddir,  	Opt_dcache,  	Opt_nodcache,  	Opt_ino32, +	Opt_noino32,  };  static match_table_t fsopt_tokens = { @@ -153,10 +155,12 @@ static match_table_t fsopt_tokens = {  	{Opt_nodirstat, "nodirstat"},  	{Opt_rbytes, "rbytes"},  	{Opt_norbytes, "norbytes"}, +	{Opt_asyncreaddir, "asyncreaddir"},  	{Opt_noasyncreaddir, "noasyncreaddir"},  	{Opt_dcache, "dcache"},  	{Opt_nodcache, "nodcache"},  	{Opt_ino32, "ino32"}, +	{Opt_noino32, 
"noino32"},  	{-1, NULL}  }; @@ -232,6 +236,9 @@ static int parse_fsopt_token(char *c, void *private)  	case Opt_norbytes:  		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;  		break; +	case Opt_asyncreaddir: +		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; +		break;  	case Opt_noasyncreaddir:  		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;  		break; @@ -244,6 +251,9 @@ static int parse_fsopt_token(char *c, void *private)  	case Opt_ino32:  		fsopt->flags |= CEPH_MOUNT_OPT_INO32;  		break; +	case Opt_noino32: +		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; +		break;  	default:  		BUG_ON(token);  	} @@ -334,10 +344,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,  	*path += 2;  	dout("server path '%s'\n", *path); -	err = ceph_parse_options(popt, options, dev_name, dev_name_end, +	*popt = ceph_parse_options(options, dev_name, dev_name_end,  				 parse_fsopt_token, (void *)fsopt); -	if (err) +	if (IS_ERR(*popt)) { +		err = PTR_ERR(*popt);  		goto out; +	}  	/* success */  	*pfsopt = fsopt; @@ -655,9 +667,8 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,  		dout("open_root_inode success\n");  		if (ceph_ino(inode) == CEPH_INO_ROOT &&  		    fsc->sb->s_root == NULL) { -			root = d_alloc_root(inode); +			root = d_make_root(inode);  			if (!root) { -				iput(inode);  				root = ERR_PTR(-ENOMEM);  				goto out;  			} @@ -927,6 +938,7 @@ static int __init init_ceph(void)  	if (ret)  		goto out; +	ceph_xattr_init();  	ret = register_filesystem(&ceph_fs_type);  	if (ret)  		goto out_icache; @@ -936,6 +948,7 @@ static int __init init_ceph(void)  	return 0;  out_icache: +	ceph_xattr_exit();  	destroy_caches();  out:  	return ret; @@ -945,6 +958,7 @@ static void __exit exit_ceph(void)  {  	dout("exit_ceph\n");  	unregister_filesystem(&ceph_fs_type); +	ceph_xattr_exit();  	destroy_caches();  } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1421f3d875a..fc35036d258 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -367,7 +367,7 @@ 
static inline u32 ceph_ino_to_ino32(__u64 vino)  	u32 ino = vino & 0xffffffff;  	ino ^= vino >> 32;  	if (!ino) -		ino = 1; +		ino = 2;  	return ino;  } @@ -733,6 +733,8 @@ extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);  extern int ceph_removexattr(struct dentry *, const char *);  extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);  extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void);  /* caps.c */  extern const char *ceph_cap_string(int c); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 857214ae8c0..35b86331d8a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -8,9 +8,12 @@  #include <linux/xattr.h>  #include <linux/slab.h> +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +  static bool ceph_is_valid_xattr(const char *name)  { -	return !strncmp(name, "ceph.", 5) || +	return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||  	       !strncmp(name, XATTR_SECURITY_PREFIX,  			XATTR_SECURITY_PREFIX_LEN) ||  	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || @@ -21,79 +24,91 @@ static bool ceph_is_valid_xattr(const char *name)   * These define virtual xattrs exposing the recursive directory   * statistics and layout metadata.   
*/ -struct ceph_vxattr_cb { -	bool readonly; +struct ceph_vxattr {  	char *name; +	size_t name_size;	/* strlen(name) + 1 (for '\0') */  	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,  			      size_t size); +	bool readonly;  };  /* directories */ -static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,  					size_t size)  {  	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);  } -static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val,  				      size_t size)  {  	return snprintf(val, size, "%lld", ci->i_files);  } -static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val,  					size_t size)  {  	return snprintf(val, size, "%lld", ci->i_subdirs);  } -static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val,  					 size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);  } -static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val,  				       size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rfiles);  } -static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val,  					 size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rsubdirs);  } -static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val,  				       size_t size)  {  	return snprintf(val, size, "%lld", ci->i_rbytes);  } -static size_t ceph_vxattrcb_rctime(struct 
ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,  				       size_t size)  { -	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, +	return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec,  			(long)ci->i_rctime.tv_nsec);  } -static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { -	{ true, "ceph.dir.entries", ceph_vxattrcb_entries}, -	{ true, "ceph.dir.files", ceph_vxattrcb_files}, -	{ true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, -	{ true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, -	{ true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, -	{ true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, -	{ true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, -	{ true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, -	{ true, NULL, NULL } +#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name + +#define XATTR_NAME_CEPH(_type, _name) \ +		{ \ +			.name = CEPH_XATTR_NAME(_type, _name), \ +			.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ +			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ +			.readonly = true, \ +		} + +static struct ceph_vxattr ceph_dir_vxattrs[] = { +	XATTR_NAME_CEPH(dir, entries), +	XATTR_NAME_CEPH(dir, files), +	XATTR_NAME_CEPH(dir, subdirs), +	XATTR_NAME_CEPH(dir, rentries), +	XATTR_NAME_CEPH(dir, rfiles), +	XATTR_NAME_CEPH(dir, rsubdirs), +	XATTR_NAME_CEPH(dir, rbytes), +	XATTR_NAME_CEPH(dir, rctime), +	{ 0 }	/* Required table terminator */  }; +static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */  /* files */ -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,  				   size_t size)  {  	int ret; @@ -103,19 +118,32 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,  		(unsigned long long)ceph_file_layout_su(ci->i_layout),  		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),  		(unsigned 
long long)ceph_file_layout_object_size(ci->i_layout)); -	if (ceph_file_layout_pg_preferred(ci->i_layout)) -		ret += snprintf(val + ret, size, "preferred_osd=%lld\n", + +	if (ceph_file_layout_pg_preferred(ci->i_layout) >= 0) { +		val += ret; +		size -= ret; +		ret += snprintf(val, size, "preferred_osd=%lld\n",  			    (unsigned long long)ceph_file_layout_pg_preferred(  				    ci->i_layout)); +	} +  	return ret;  } -static struct ceph_vxattr_cb ceph_file_vxattrs[] = { -	{ true, "ceph.layout", ceph_vxattrcb_layout}, -	{ NULL, NULL } +static struct ceph_vxattr ceph_file_vxattrs[] = { +	XATTR_NAME_CEPH(file, layout), +	/* The following extended attribute name is deprecated */ +	{ +		.name = XATTR_CEPH_PREFIX "layout", +		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"), +		.getxattr_cb = ceph_vxattrcb_file_layout, +		.readonly = true, +	}, +	{ 0 }	/* Required table terminator */  }; +static size_t ceph_file_vxattrs_name_size;	/* total size of all names */ -static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)  {  	if (S_ISDIR(inode->i_mode))  		return ceph_dir_vxattrs; @@ -124,14 +152,59 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)  	return NULL;  } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ +	if (vxattrs == ceph_dir_vxattrs) +		return ceph_dir_vxattrs_name_size; +	if (vxattrs == ceph_file_vxattrs) +		return ceph_file_vxattrs_name_size; +	BUG(); + +	return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. 
+ */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ +	struct ceph_vxattr *vxattr; +	size_t size = 0; + +	for (vxattr = vxattrs; vxattr->name; vxattr++) +		size += vxattr->name_size; + +	return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ +	ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); +	ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ +	ceph_dir_vxattrs_name_size = 0; +	ceph_file_vxattrs_name_size = 0; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,  						const char *name)  { -	do { -		if (strcmp(vxattr->name, name) == 0) -			return vxattr; -		vxattr++; -	} while (vxattr->name); +	struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + +	if (vxattr) { +		while (vxattr->name) { +			if (!strcmp(vxattr->name, name)) +				return vxattr; +			vxattr++; +		} +	} +  	return NULL;  } @@ -500,17 +573,15 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,  {  	struct inode *inode = dentry->d_inode;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);  	int err;  	struct ceph_inode_xattr *xattr; -	struct ceph_vxattr_cb *vxattr = NULL; +	struct ceph_vxattr *vxattr = NULL;  	if (!ceph_is_valid_xattr(name))  		return -ENODATA;  	/* let's see if a virtual xattr was requested */ -	if (vxattrs) -		vxattr = ceph_match_vxattr(vxattrs, name); +	vxattr = ceph_match_vxattr(inode, name);  	spin_lock(&ci->i_ceph_lock);  	dout("getxattr %p ver=%lld index_ver=%lld\n", inode, @@ -566,7 +637,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)  {  	struct inode *inode = dentry->d_inode;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); +	struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);  	u32 vir_namelen = 0;  	u32 namelen;  	int 
err; @@ -594,11 +665,12 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)  		goto out;  list_xattr: -	vir_namelen = 0; -	/* include virtual dir xattrs */ -	if (vxattrs) -		for (i = 0; vxattrs[i].name; i++) -			vir_namelen += strlen(vxattrs[i].name) + 1; +	/* +	 * Start with virtual dir xattr names (if any) (including +	 * terminating '\0' characters for each). +	 */ +	vir_namelen = ceph_vxattrs_name_size(vxattrs); +  	/* adding 1 byte per each variable due to the null termination */  	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;  	err = -ERANGE; @@ -696,17 +768,17 @@ int ceph_setxattr(struct dentry *dentry, const char *name,  		  const void *value, size_t size, int flags)  {  	struct inode *inode = dentry->d_inode; +	struct ceph_vxattr *vxattr;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); +	int issued;  	int err; +	int dirty;  	int name_len = strlen(name);  	int val_len = size;  	char *newname = NULL;  	char *newval = NULL;  	struct ceph_inode_xattr *xattr = NULL; -	int issued;  	int required_blob_size; -	int dirty;  	if (ceph_snap(inode) != CEPH_NOSNAP)  		return -EROFS; @@ -714,12 +786,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,  	if (!ceph_is_valid_xattr(name))  		return -EOPNOTSUPP; -	if (vxattrs) { -		struct ceph_vxattr_cb *vxattr = -			ceph_match_vxattr(vxattrs, name); -		if (vxattr && vxattr->readonly) -			return -EOPNOTSUPP; -	} +	vxattr = ceph_match_vxattr(inode, name); +	if (vxattr && vxattr->readonly) +		return -EOPNOTSUPP;  	/* preallocate memory for xattr name, value, index node */  	err = -ENOMEM; @@ -728,11 +797,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,  		goto out;  	if (val_len) { -		newval = kmalloc(val_len + 1, GFP_NOFS); +		newval = kmemdup(value, val_len, GFP_NOFS);  		if (!newval)  			goto out; -		memcpy(newval, value, val_len); -		newval[val_len] = '\0';  	}  	xattr = 
kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); @@ -742,6 +809,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,  	spin_lock(&ci->i_ceph_lock);  retry:  	issued = __ceph_caps_issued(ci, NULL); +	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));  	if (!(issued & CEPH_CAP_XATTR_EXCL))  		goto do_sync;  	__build_xattrs(inode); @@ -750,7 +818,7 @@ retry:  	if (!ci->i_xattrs.prealloc_blob ||  	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { -		struct ceph_buffer *blob = NULL; +		struct ceph_buffer *blob;  		spin_unlock(&ci->i_ceph_lock);  		dout(" preaallocating new blob size=%d\n", required_blob_size); @@ -764,12 +832,13 @@ retry:  		goto retry;  	} -	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));  	err = __set_xattr(ci, newname, name_len, newval,  			  val_len, 1, 1, 1, &xattr); +  	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);  	ci->i_xattrs.dirty = true;  	inode->i_ctime = CURRENT_TIME; +  	spin_unlock(&ci->i_ceph_lock);  	if (dirty)  		__mark_inode_dirty(inode, dirty); @@ -814,8 +883,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)  int ceph_removexattr(struct dentry *dentry, const char *name)  {  	struct inode *inode = dentry->d_inode; +	struct ceph_vxattr *vxattr;  	struct ceph_inode_info *ci = ceph_inode(inode); -	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);  	int issued;  	int err;  	int required_blob_size; @@ -827,22 +896,19 @@ int ceph_removexattr(struct dentry *dentry, const char *name)  	if (!ceph_is_valid_xattr(name))  		return -EOPNOTSUPP; -	if (vxattrs) { -		struct ceph_vxattr_cb *vxattr = -			ceph_match_vxattr(vxattrs, name); -		if (vxattr && vxattr->readonly) -			return -EOPNOTSUPP; -	} +	vxattr = ceph_match_vxattr(inode, name); +	if (vxattr && vxattr->readonly) +		return -EOPNOTSUPP;  	err = -ENOMEM;  	spin_lock(&ci->i_ceph_lock); -	__build_xattrs(inode);  retry:  	issued = __ceph_caps_issued(ci, NULL);  	dout("removexattr %p 
issued %s\n", inode, ceph_cap_string(issued));  	if (!(issued & CEPH_CAP_XATTR_EXCL))  		goto do_sync; +	__build_xattrs(inode);  	required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -863,10 +929,10 @@ retry:  	}  	err = __remove_xattr_by_name(ceph_inode(inode), name); +  	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);  	ci->i_xattrs.dirty = true;  	inode->i_ctime = CURRENT_TIME; -  	spin_unlock(&ci->i_ceph_lock);  	if (dirty)  		__mark_inode_dirty(inode, dirty); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index f66cc162515..2b243af70aa 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -139,8 +139,7 @@ config CIFS_DFS_UPCALL  	    points. If unsure, say N.  config CIFS_FSCACHE -	  bool "Provide CIFS client caching support (EXPERIMENTAL)" -	  depends on EXPERIMENTAL +	  bool "Provide CIFS client caching support"  	  depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y  	  help  	    Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data @@ -148,8 +147,8 @@ config CIFS_FSCACHE  	    manager. If unsure, say N.  config CIFS_ACL -	  bool "Provide CIFS ACL support (EXPERIMENTAL)" -	  depends on EXPERIMENTAL && CIFS_XATTR && KEYS +	  bool "Provide CIFS ACL support" +	  depends on CIFS_XATTR && KEYS  	  help  	    Allows to fetch CIFS/NTFS ACL from the server.  The DACL blob  	    is handed over to the application/caller. diff --git a/fs/cifs/README b/fs/cifs/README index 895da1dc155..b7d782bab79 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -753,10 +753,6 @@ module loading or during the runtime by using the interface  i.e. echo "value" > /sys/module/cifs/parameters/<param> -1. echo_retries - The number of echo attempts before giving up and -		  reconnecting to the server. The default is 5. The value 0 -		  means never reconnect. - -2. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. +1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.  		    [Y/y/1]. 
To disable use any of [N/n/0]. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 84e8c072470..573b899b5a5 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -171,8 +171,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)  			seq_printf(m, "TCP status: %d\n\tLocal Users To "  				   "Server: %d SecMode: 0x%x Req On Wire: %d",  				   server->tcpStatus, server->srv_count, -				   server->sec_mode, -				   atomic_read(&server->inFlight)); +				   server->sec_mode, in_flight(server));  #ifdef CONFIG_CIFS_STATS2  			seq_printf(m, " In Send: %d In MaxReq Wait: %d", @@ -676,14 +675,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file,  {  	char c;  	int rc; +	static bool warned;  	rc = get_user(c, buffer);  	if (rc)  		return rc;  	if (c == '0' || c == 'n' || c == 'N')  		multiuser_mount = 0; -	else if (c == '1' || c == 'y' || c == 'Y') +	else if (c == '1' || c == 'y' || c == 'Y') {  		multiuser_mount = 1; +		if (!warned) { +			warned = true; +			printk(KERN_WARNING "CIFS VFS: The legacy multiuser " +				"mount code is scheduled to be deprecated in " +				"3.5. 
Please switch to using the multiuser " +				"mount option."); +		} +	}  	return count;  } diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 2272fd5fe5b..e622863b292 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)  		   MAX_MECH_STR_LEN +  		   UID_KEY_LEN + (sizeof(uid_t) * 2) +  		   CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + -		   USER_KEY_LEN + strlen(sesInfo->user_name) +  		   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; +	if (sesInfo->user_name) +		desc_len += USER_KEY_LEN + strlen(sesInfo->user_name); +  	spnego_key = ERR_PTR(-ENOMEM);  	description = kzalloc(desc_len, GFP_KERNEL);  	if (description == NULL) @@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)  	dp = description + strlen(description);  	sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); -	dp = description + strlen(description); -	sprintf(dp, ";user=%s", sesInfo->user_name); +	if (sesInfo->user_name) { +		dp = description + strlen(description); +		sprintf(dp, ";user=%s", sesInfo->user_name); +	}  	dp = description + strlen(description);  	sprintf(dp, ";pid=0x%x", current->pid); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 1b2e180b018..fbb9da95184 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -27,17 +27,17 @@  #include "cifs_debug.h"  /* - * cifs_ucs2_bytes - how long will a string be after conversion? - * @ucs - pointer to input string + * cifs_utf16_bytes - how long will a string be after conversion? + * @utf16 - pointer to input string   * @maxbytes - don't go past this many bytes of input string   * @codepage - destination codepage   * - * Walk a ucs2le string and return the number of bytes that the string will + * Walk a utf16le string and return the number of bytes that the string will   * be after being converted to the given charset, not including any null   * termination required. Don't walk past maxbytes in the source buffer.   
*/  int -cifs_ucs2_bytes(const __le16 *from, int maxbytes, +cifs_utf16_bytes(const __le16 *from, int maxbytes,  		const struct nls_table *codepage)  {  	int i; @@ -122,7 +122,7 @@ cp_convert:  }  /* - * cifs_from_ucs2 - convert utf16le string to local charset + * cifs_from_utf16 - convert utf16le string to local charset   * @to - destination buffer   * @from - source buffer   * @tolen - destination buffer size (in bytes) @@ -130,7 +130,7 @@ cp_convert:   * @codepage - codepage to which characters should be converted   * @mapchar - should characters be remapped according to the mapchars option?   * - * Convert a little-endian ucs2le string (as sent by the server) to a string + * Convert a little-endian utf16le string (as sent by the server) to a string   * in the provided codepage. The tolen and fromlen parameters are to ensure   * that the code doesn't walk off of the end of the buffer (which is always   * a danger if the alignment of the source buffer is off). The destination @@ -139,12 +139,12 @@ cp_convert:   * null terminator).   *   * Note that some windows versions actually send multiword UTF-16 characters - * instead of straight UCS-2. The linux nls routines however aren't able to + * instead of straight UTF16-2. The linux nls routines however aren't able to   * deal with those characters properly. In the event that we get some of   * those characters, they won't be translated properly.   
*/  int -cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, +cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,  		 const struct nls_table *codepage, bool mapchar)  {  	int i, charlen, safelen; @@ -190,13 +190,13 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,  }  /* - * NAME:	cifs_strtoUCS() + * NAME:	cifs_strtoUTF16()   *   * FUNCTION:	Convert character string to unicode string   *   */  int -cifs_strtoUCS(__le16 *to, const char *from, int len, +cifs_strtoUTF16(__le16 *to, const char *from, int len,  	      const struct nls_table *codepage)  {  	int charlen; @@ -206,7 +206,7 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,  	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {  		charlen = codepage->char2uni(from, len, &wchar_to);  		if (charlen < 1) { -			cERROR(1, "strtoUCS: char2uni of 0x%x returned %d", +			cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",  				*from, charlen);  			/* A question mark */  			wchar_to = 0x003f; @@ -220,7 +220,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,  }  /* - * cifs_strndup_from_ucs - copy a string from wire format to the local codepage + * cifs_strndup_from_utf16 - copy a string from wire format to the local + * codepage   * @src - source string   * @maxlen - don't walk past this many bytes in the source string   * @is_unicode - is this a unicode string? @@ -231,19 +232,19 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,   * error.   
*/  char * -cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, -	     const struct nls_table *codepage) +cifs_strndup_from_utf16(const char *src, const int maxlen, +			const bool is_unicode, const struct nls_table *codepage)  {  	int len;  	char *dst;  	if (is_unicode) { -		len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage); +		len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);  		len += nls_nullsize(codepage);  		dst = kmalloc(len, GFP_KERNEL);  		if (!dst)  			return NULL; -		cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage, +		cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,  			       false);  	} else {  		len = strnlen(src, maxlen); @@ -264,7 +265,7 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,   * names are little endian 16 bit Unicode on the wire   */  int -cifsConvertToUCS(__le16 *target, const char *source, int srclen, +cifsConvertToUTF16(__le16 *target, const char *source, int srclen,  		 const struct nls_table *cp, int mapChars)  {  	int i, j, charlen; @@ -273,7 +274,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,  	wchar_t tmp;  	if (!mapChars) -		return cifs_strtoUCS(target, source, PATH_MAX, cp); +		return cifs_strtoUTF16(target, source, PATH_MAX, cp);  	for (i = 0, j = 0; i < srclen; j++) {  		src_char = source[i]; @@ -281,7 +282,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,  		switch (src_char) {  		case 0:  			put_unaligned(0, &target[j]); -			goto ctoUCS_out; +			goto ctoUTF16_out;  		case ':':  			dst_char = cpu_to_le16(UNI_COLON);  			break; @@ -326,7 +327,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,  		put_unaligned(dst_char, &target[j]);  	} -ctoUCS_out: +ctoUTF16_out:  	return i;  } diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 6d02fd56056..a513a546700 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -74,16 +74,16 @@ extern const struct 
UniCaseRange CifsUniLowerRange[];  #endif				/* UNIUPR_NOLOWER */  #ifdef __KERNEL__ -int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen, -		   const struct nls_table *codepage, bool mapchar); -int cifs_ucs2_bytes(const __le16 *from, int maxbytes, -		    const struct nls_table *codepage); -int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); -char *cifs_strndup_from_ucs(const char *src, const int maxlen, -			    const bool is_unicode, -			    const struct nls_table *codepage); -extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, -			const struct nls_table *cp, int mapChars); +int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, +		    const struct nls_table *codepage, bool mapchar); +int cifs_utf16_bytes(const __le16 *from, int maxbytes, +		     const struct nls_table *codepage); +int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *); +char *cifs_strndup_from_utf16(const char *src, const int maxlen, +			      const bool is_unicode, +			      const struct nls_table *codepage); +extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen, +			      const struct nls_table *cp, int mapChars);  #endif diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 72ddf23ef6f..3cc1b251ca0 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -556,6 +556,7 @@ init_cifs_idmap(void)  	/* instruct request_key() to use this special keyring as a cache for  	 * the results it looks up */ +	set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);  	cred->thread_keyring = keyring;  	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;  	root_cred = cred; @@ -909,6 +910,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,  		umode_t group_mask = S_IRWXG;  		umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO; +		if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *)) +			return;  		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),  				GFP_KERNEL);  		
if (!ppace) { diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5d9b9acc5fc..63c460e503b 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)  	attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);  	attrptr->length = cpu_to_le16(2 * dlen);  	blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); -	cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); +	cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);  	return 0;  } @@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)  					kmalloc(attrsize + 1, GFP_KERNEL);  				if (!ses->domainName)  						return -ENOMEM; -				cifs_from_ucs2(ses->domainName, +				cifs_from_utf16(ses->domainName,  					(__le16 *)blobptr, attrsize, attrsize,  					nls_cp, false);  				break; @@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,  	}  	/* convert ses->user_name to unicode and uppercase */ -	len = strlen(ses->user_name); +	len = ses->user_name ? 
strlen(ses->user_name) : 0;  	user = kmalloc(2 + (len * 2), GFP_KERNEL);  	if (user == NULL) {  		cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");  		rc = -ENOMEM;  		return rc;  	} -	len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); -	UniStrupr(user); + +	if (len) { +		len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp); +		UniStrupr(user); +	} else { +		memset(user, '\0', 2); +	}  	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,  				(char *)user, 2 * len); @@ -448,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,  			rc = -ENOMEM;  			return rc;  		} -		len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, -					nls_cp); +		len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, +				      nls_cp);  		rc =  		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,  					(char *)domain, 2 * len); @@ -468,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,  			rc = -ENOMEM;  			return rc;  		} -		len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, +		len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,  					nls_cp);  		rc =  		crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b1fd382d195..eee522c56ef 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -76,12 +76,7 @@ MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "  unsigned int cifs_max_pending = CIFS_MAX_REQ;  module_param(cifs_max_pending, int, 0444);  MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " -				   "Default: 50 Range: 2 to 256"); -unsigned short echo_retries = 5; -module_param(echo_retries, ushort, 0644); -MODULE_PARM_DESC(echo_retries, "Number of echo attempts before giving up and " -			       "reconnecting server. Default: 5. 
0 means " -			       "never reconnect."); +				   "Default: 32767 Range: 2 to 32767.");  module_param(enable_oplocks, bool, 0644);  MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"  				 "y/Y/1"); @@ -119,12 +114,10 @@ cifs_read_super(struct super_block *sb)  	if (IS_ERR(inode)) {  		rc = PTR_ERR(inode); -		inode = NULL;  		goto out_no_root;  	} -	sb->s_root = d_alloc_root(inode); - +	sb->s_root = d_make_root(inode);  	if (!sb->s_root) {  		rc = -ENOMEM;  		goto out_no_root; @@ -147,9 +140,6 @@ cifs_read_super(struct super_block *sb)  out_no_root:  	cERROR(1, "cifs_read_super: get root inode failed"); -	if (inode) -		iput(inode); -  	return rc;  } @@ -1116,9 +1106,9 @@ init_cifs(void)  	if (cifs_max_pending < 2) {  		cifs_max_pending = 2;  		cFYI(1, "cifs_max_pending set to min of 2"); -	} else if (cifs_max_pending > 256) { -		cifs_max_pending = 256; -		cFYI(1, "cifs_max_pending set to max of 256"); +	} else if (cifs_max_pending > CIFS_MAX_REQ) { +		cifs_max_pending = CIFS_MAX_REQ; +		cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);  	}  	rc = cifs_fscache_register(); @@ -1180,11 +1170,8 @@ static void __exit  exit_cifs(void)  {  	cFYI(DBG2, "exit_cifs"); -	cifs_proc_clean(); -	cifs_fscache_unregister(); -#ifdef CONFIG_CIFS_DFS_UPCALL +	unregister_filesystem(&cifs_fs_type);  	cifs_dfs_release_automount_timer(); -#endif  #ifdef CONFIG_CIFS_ACL  	cifs_destroy_idmaptrees();  	exit_cifs_idmap(); @@ -1192,10 +1179,11 @@ exit_cifs(void)  #ifdef CONFIG_CIFS_UPCALL  	unregister_key_type(&cifs_spnego_key_type);  #endif -	unregister_filesystem(&cifs_fs_type); -	cifs_destroy_inodecache(); -	cifs_destroy_mids();  	cifs_destroy_request_bufs(); +	cifs_destroy_mids(); +	cifs_destroy_inodecache(); +	cifs_fscache_unregister(); +	cifs_proc_clean();  }  MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ba53c1c6c6c..339ebe3ebc0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h 
@@ -55,14 +55,9 @@  /*   * MAX_REQ is the maximum number of requests that WE will send - * on one socket concurrently. It also matches the most common - * value of max multiplex returned by servers.  We may - * eventually want to use the negotiated value (in case - * future servers can handle more) when we are more confident that - * we will not have problems oveloading the socket with pending - * write data. + * on one socket concurrently.   */ -#define CIFS_MAX_REQ 50 +#define CIFS_MAX_REQ 32767  #define RFC1001_NAME_LEN 15  #define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1) @@ -255,7 +250,9 @@ struct TCP_Server_Info {  	bool noblocksnd;		/* use blocking sendmsg */  	bool noautotune;		/* do not autotune send buf sizes */  	bool tcp_nodelay; -	atomic_t inFlight;  /* number of requests on the wire to server */ +	int credits;  /* send no more requests at once */ +	unsigned int in_flight;  /* number of requests on the wire to server */ +	spinlock_t req_lock;  /* protect the two values above */  	struct mutex srv_mutex;  	struct task_struct *tsk;  	char server_GUID[16]; @@ -263,6 +260,7 @@ struct TCP_Server_Info {  	bool session_estab; /* mark when very first sess is established */  	u16 dialect; /* dialect index that server chose */  	enum securityEnum secType; +	bool oplocks:1; /* enable oplocks */  	unsigned int maxReq;	/* Clients should submit no more */  	/* than maxReq distinct unanswered SMBs to the server when using  */  	/* multiplexed reads or writes */ @@ -307,6 +305,36 @@ struct TCP_Server_Info {  #endif  }; +static inline unsigned int +in_flight(struct TCP_Server_Info *server) +{ +	unsigned int num; +	spin_lock(&server->req_lock); +	num = server->in_flight; +	spin_unlock(&server->req_lock); +	return num; +} + +static inline int* +get_credits_field(struct TCP_Server_Info *server) +{ +	/* +	 * This will change to switch statement when we reserve slots for echos +	 * and oplock breaks. 
+	 */ +	return &server->credits; +} + +static inline bool +has_credits(struct TCP_Server_Info *server, int *credits) +{ +	int num; +	spin_lock(&server->req_lock); +	num = *credits; +	spin_unlock(&server->req_lock); +	return num > 0; +} +  /*   * Macros to allow the TCP_Server_Info->net field and related code to drop out   * when CONFIG_NET_NS isn't set. @@ -879,6 +907,8 @@ require use of the stronger protocol */  #define   CIFSSEC_MASK          0xB70B7 /* current flags supported if weak */  #endif /* UPCALL */  #else /* do not allow weak pw hash */ +#define   CIFSSEC_MUST_LANMAN	0 +#define   CIFSSEC_MUST_PLNTXT	0  #ifdef CONFIG_CIFS_UPCALL  #define   CIFSSEC_MASK          0x8F08F /* flags supported if no weak allowed */  #else @@ -1008,9 +1038,6 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */  GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */  GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ -/* reconnect after this many failed echo attempts */ -GLOBAL_EXTERN unsigned short echo_retries; -  #ifdef CONFIG_CIFS_ACL  GLOBAL_EXTERN struct rb_root uidtree;  GLOBAL_EXTERN struct rb_root gidtree; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 6f4e243e0f6..503e73d8bdb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -88,6 +88,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid,  			struct smb_hdr *in_buf ,  			struct smb_hdr *out_buf,  			int *bytes_returned); +extern void cifs_add_credits(struct TCP_Server_Info *server, +			     const unsigned int add); +extern void cifs_set_credits(struct TCP_Server_Info *server, const int val);  extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);  extern bool is_valid_oplock_break(struct smb_hdr *smb,  				  struct TCP_Server_Info *); @@ -168,7 +171,13 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data,  					    const char *devname);  extern int cifs_mount(struct 
cifs_sb_info *, struct smb_vol *);  extern void cifs_umount(struct cifs_sb_info *); + +#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)  extern void cifs_dfs_release_automount_timer(void); +#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ +#define cifs_dfs_release_automount_timer()	do { } while (0) +#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ +  void cifs_proc_init(void);  void cifs_proc_clean(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6600aa2d2ef..70aac35c398 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -458,7 +458,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)  			goto neg_err_exit;  		}  		server->sec_mode = (__u8)le16_to_cpu(rsp->SecurityMode); -		server->maxReq = le16_to_cpu(rsp->MaxMpxCount); +		server->maxReq = min_t(unsigned int, +				       le16_to_cpu(rsp->MaxMpxCount), +				       cifs_max_pending); +		cifs_set_credits(server, server->maxReq);  		server->maxBuf = le16_to_cpu(rsp->MaxBufSize);  		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);  		/* even though we do not use raw we might as well set this @@ -564,7 +567,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses)  	/* one byte, so no need to convert this or EncryptionKeyLen from  	   little endian */ -	server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); +	server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount), +			       cifs_max_pending); +	cifs_set_credits(server, server->maxReq);  	/* probably no need to store and check maxvcs */  	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);  	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); @@ -716,8 +721,7 @@ cifs_echo_callback(struct mid_q_entry *mid)  	struct TCP_Server_Info *server = mid->callback_data;  	DeleteMidQEntry(mid); -	atomic_dec(&server->inFlight); -	wake_up(&server->request_q); +	cifs_add_credits(server, 1);  }  int @@ -821,8 +825,8 @@ PsxDelete:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, -				 
    PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else { /* BB add path length overrun check */ @@ -893,8 +897,8 @@ DelFileRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->fileName, fileName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {		/* BB improve check for buffer overruns BB */ @@ -938,8 +942,8 @@ RmDirRetry:  		return rc;  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { -		name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName, -					 PATH_MAX, nls_codepage, remap); +		name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName, +					      PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {		/* BB improve check for buffer overruns BB */ @@ -981,8 +985,8 @@ MkDirRetry:  		return rc;  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { -		name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name, -					    PATH_MAX, nls_codepage, remap); +		name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name, +					      PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {		/* BB improve check for buffer overruns BB */ @@ -1030,8 +1034,8 @@ PsxCreat:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, name, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, name, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -1197,8 +1201,8 @@ OldOpenRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		count = 1;      /* 
account for one byte pad to word boundary */  		name_len = -		   cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), -				    fileName, PATH_MAX, nls_codepage, remap); +		   cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), +				      fileName, PATH_MAX, nls_codepage, remap);  		name_len++;     /* trailing null */  		name_len *= 2;  	} else {                /* BB improve check for buffer overruns BB */ @@ -1304,8 +1308,8 @@ openRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		count = 1;	/* account for one byte pad to word boundary */  		name_len = -		    cifsConvertToUCS((__le16 *) (pSMB->fileName + 1), -				     fileName, PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1), +				       fileName, PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  		pSMB->NameLength = cpu_to_le16(name_len); @@ -1669,8 +1673,7 @@ cifs_readv_callback(struct mid_q_entry *mid)  	queue_work(system_nrt_wq, &rdata->work);  	DeleteMidQEntry(mid); -	atomic_dec(&server->inFlight); -	wake_up(&server->request_q); +	cifs_add_credits(server, 1);  }  /* cifs_async_readv - send an async write, and set up mid to handle result */ @@ -2110,8 +2113,7 @@ cifs_writev_callback(struct mid_q_entry *mid)  	queue_work(system_nrt_wq, &wdata->work);  	DeleteMidQEntry(mid); -	atomic_dec(&tcon->ses->server->inFlight); -	wake_up(&tcon->ses->server->request_q); +	cifs_add_credits(tcon->ses->server, 1);  }  /* cifs_async_writev - send an async write, and set up mid to handle result */ @@ -2649,16 +2651,16 @@ renameRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  		pSMB->OldFileName[name_len] = 0x04;	/* pad */  	/* protocol requires ASCII signature byte on 
Unicode string */  		pSMB->OldFileName[name_len + 1] = 0x00;  		name_len2 = -		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], -				     toName, PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], +				       toName, PATH_MAX, nls_codepage, remap);  		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;  		name_len2 *= 2;	/* convert to bytes */  	} else {	/* BB improve the check for buffer overruns BB */ @@ -2738,10 +2740,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,  	/* unicode only call */  	if (target_name == NULL) {  		sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid); -		len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, +		len_of_str = +			cifsConvertToUTF16((__le16 *)rename_info->target_name,  					dummy_string, 24, nls_codepage, remap);  	} else { -		len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name, +		len_of_str = +			cifsConvertToUTF16((__le16 *)rename_info->target_name,  					target_name, PATH_MAX, nls_codepage,  					remap);  	} @@ -2795,17 +2799,17 @@ copyRetry:  	pSMB->Flags = cpu_to_le16(flags & COPY_TREE);  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { -		name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName, -					    fromName, PATH_MAX, nls_codepage, -					    remap); +		name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName, +					      fromName, PATH_MAX, nls_codepage, +					      remap);  		name_len++;     /* trailing null */  		name_len *= 2;  		pSMB->OldFileName[name_len] = 0x04;     /* pad */  		/* protocol requires ASCII signature byte on Unicode string */  		pSMB->OldFileName[name_len + 1] = 0x00;  		name_len2 = -		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], -				toName, PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], +				       toName, PATH_MAX, nls_codepage, remap);  		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;  		
name_len2 *= 2; /* convert to bytes */  	} else { 	/* BB improve the check for buffer overruns BB */ @@ -2861,9 +2865,9 @@ createSymLinkRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX -				  /* find define for this maxpathcomponent */ -				  , nls_codepage); +		    cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName, +				    /* find define for this maxpathcomponent */ +				    PATH_MAX, nls_codepage);  		name_len++;	/* trailing null */  		name_len *= 2; @@ -2885,9 +2889,9 @@ createSymLinkRetry:  	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len_target = -		    cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX -				  /* find define for this maxpathcomponent */ -				  , nls_codepage); +		    cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX +				    /* find define for this maxpathcomponent */ +				    , nls_codepage);  		name_len_target++;	/* trailing null */  		name_len_target *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -2949,8 +2953,8 @@ createHardLinkRetry:  		return rc;  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { -		name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName, -					    PATH_MAX, nls_codepage, remap); +		name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName, +					      PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2; @@ -2972,8 +2976,8 @@ createHardLinkRetry:  	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len_target = -		    cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX, -				     nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) data_offset, fromName, +				       PATH_MAX, nls_codepage, remap);  		name_len_target++;	/* trailing null */  		name_len_target *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -3042,8 
+3046,8 @@ winCreateHardLinkRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2; @@ -3051,8 +3055,8 @@ winCreateHardLinkRetry:  		pSMB->OldFileName[name_len] = 0x04;  		pSMB->OldFileName[name_len + 1] = 0x00; /* pad */  		name_len2 = -		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], -				     toName, PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2], +				       toName, PATH_MAX, nls_codepage, remap);  		name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;  		name_len2 *= 2;	/* convert to bytes */  	} else {	/* BB improve the check for buffer overruns BB */ @@ -3108,8 +3112,8 @@ querySymLinkRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifs_strtoUCS((__le16 *) pSMB->FileName, searchName, -				  PATH_MAX, nls_codepage); +			cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName, +					PATH_MAX, nls_codepage);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -3166,8 +3170,8 @@ querySymLinkRetry:  				is_unicode = false;  			/* BB FIXME investigate remapping reserved chars here */ -			*symlinkinfo = cifs_strndup_from_ucs(data_start, count, -						    is_unicode, nls_codepage); +			*symlinkinfo = cifs_strndup_from_utf16(data_start, +					count, is_unicode, nls_codepage);  			if (!*symlinkinfo)  				rc = -ENOMEM;  		} @@ -3450,8 +3454,9 @@ queryAclRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, -					 PATH_MAX, nls_codepage, remap); +			cifsConvertToUTF16((__le16 *) pSMB->FileName, +					   searchName, PATH_MAX, nls_codepage, +					   remap);  		
name_len++;     /* trailing null */  		name_len *= 2;  		pSMB->FileName[name_len] = 0; @@ -3537,8 +3542,8 @@ setAclRetry:  		return rc;  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -			cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, -				      PATH_MAX, nls_codepage, remap); +			cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, +					   PATH_MAX, nls_codepage, remap);  		name_len++;     /* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -3948,8 +3953,9 @@ QInfRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, -					PATH_MAX, nls_codepage, remap); +			cifsConvertToUTF16((__le16 *) pSMB->FileName, +					   searchName, PATH_MAX, nls_codepage, +					   remap);  		name_len++;     /* trailing null */  		name_len *= 2;  	} else { @@ -4086,8 +4092,8 @@ QPathInfoRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -4255,8 +4261,8 @@ UnixQPathInfoRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, -				  PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -4344,8 +4350,8 @@ findFirstRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, -				 PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, +				 
      PATH_MAX, nls_codepage, remap);  		/* We can not add the asterik earlier in case  		it got remapped to 0xF03A as if it were part of the  		directory name instead of a wildcard */ @@ -4656,8 +4662,9 @@ GetInodeNumberRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -			cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, -					 PATH_MAX, nls_codepage, remap); +			cifsConvertToUTF16((__le16 *) pSMB->FileName, +					   searchName, PATH_MAX, nls_codepage, +					   remap);  		name_len++;     /* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -4794,9 +4801,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,  				rc = -ENOMEM;  				goto parse_DFS_referrals_exit;  			} -			cifsConvertToUCS((__le16 *) tmp, searchName, -					PATH_MAX, nls_codepage, remap); -			node->path_consumed = cifs_ucs2_bytes(tmp, +			cifsConvertToUTF16((__le16 *) tmp, searchName, +					   PATH_MAX, nls_codepage, remap); +			node->path_consumed = cifs_utf16_bytes(tmp,  					le16_to_cpu(pSMBr->PathConsumed),  					nls_codepage);  			kfree(tmp); @@ -4809,8 +4816,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,  		/* copy DfsPath */  		temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);  		max_len = data_end - temp; -		node->path_name = cifs_strndup_from_ucs(temp, max_len, -						      is_unicode, nls_codepage); +		node->path_name = cifs_strndup_from_utf16(temp, max_len, +						is_unicode, nls_codepage);  		if (!node->path_name) {  			rc = -ENOMEM;  			goto parse_DFS_referrals_exit; @@ -4819,8 +4826,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,  		/* copy link target UNC */  		temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);  		max_len = data_end - temp; -		node->node_name = cifs_strndup_from_ucs(temp, max_len, -						      is_unicode, nls_codepage); +		node->node_name = cifs_strndup_from_utf16(temp, max_len, +						is_unicode, nls_codepage);  		if (!node->node_name)  			rc 
= -ENOMEM;  	} @@ -4873,8 +4880,9 @@ getDFSRetry:  	if (ses->capabilities & CAP_UNICODE) {  		pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->RequestFileName, -				     searchName, PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->RequestFileName, +				       searchName, PATH_MAX, nls_codepage, +				       remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -5506,8 +5514,8 @@ SetEOFRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -5796,8 +5804,8 @@ SetTimesRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -5877,8 +5885,8 @@ SetAttrLgcyRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -			ConvertToUCS((__le16 *) pSMB->fileName, fileName, -				PATH_MAX, nls_codepage); +			ConvertToUTF16((__le16 *) pSMB->fileName, fileName, +				       PATH_MAX, nls_codepage);  		name_len++;     /* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -6030,8 +6038,8 @@ setPermsRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, +				      
 PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -6123,8 +6131,8 @@ QAllEAsRetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		list_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName, +				       PATH_MAX, nls_codepage, remap);  		list_len++;	/* trailing null */  		list_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ @@ -6301,8 +6309,8 @@ SetEARetry:  	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {  		name_len = -		    cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, -				     PATH_MAX, nls_codepage, remap); +		    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName, +				       PATH_MAX, nls_codepage, remap);  		name_len++;	/* trailing null */  		name_len *= 2;  	} else {	/* BB improve the check for buffer overruns BB */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4666780f315..5560e1d5e54 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -38,6 +38,7 @@  #include <asm/processor.h>  #include <linux/inet.h>  #include <linux/module.h> +#include <keys/user-type.h>  #include <net/ipv6.h>  #include "cifspdu.h"  #include "cifsglob.h" @@ -225,74 +226,90 @@ static int check2ndT2(struct smb_hdr *pSMB)  static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)  { -	struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond; +	struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond;  	struct smb_t2_rsp *pSMBt  = (struct smb_t2_rsp *)pTargetSMB; -	char *data_area_of_target; -	char *data_area_of_buf2; +	char *data_area_of_tgt; +	char *data_area_of_src;  	int remaining; -	unsigned int byte_count, total_in_buf; -	__u16 total_data_size, total_in_buf2; +	unsigned int byte_count, total_in_tgt; +	__u16 tgt_total_cnt, src_total_cnt, total_in_src; -	total_data_size = 
get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); +	src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount); +	tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); -	if (total_data_size != -	    get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount)) -		cFYI(1, "total data size of primary and secondary t2 differ"); +	if (tgt_total_cnt != src_total_cnt) +		cFYI(1, "total data count of primary and secondary t2 differ " +			"source=%hu target=%hu", src_total_cnt, tgt_total_cnt); -	total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); +	total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); -	remaining = total_data_size - total_in_buf; +	remaining = tgt_total_cnt - total_in_tgt; -	if (remaining < 0) +	if (remaining < 0) { +		cFYI(1, "Server sent too much data. tgt_total_cnt=%hu " +			"total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);  		return -EPROTO; +	} -	if (remaining == 0) /* nothing to do, ignore */ +	if (remaining == 0) { +		/* nothing to do, ignore */ +		cFYI(1, "no more data remains");  		return 0; +	} -	total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount); -	if (remaining < total_in_buf2) { +	total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount); +	if (remaining < total_in_src)  		cFYI(1, "transact2 2nd response contains too much data"); -	}  	/* find end of first SMB data area */ -	data_area_of_target = (char *)&pSMBt->hdr.Protocol + +	data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +  				get_unaligned_le16(&pSMBt->t2_rsp.DataOffset); -	/* validate target area */ -	data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol + -				get_unaligned_le16(&pSMB2->t2_rsp.DataOffset); +	/* validate target area */ +	data_area_of_src = (char *)&pSMBs->hdr.Protocol + +				get_unaligned_le16(&pSMBs->t2_rsp.DataOffset); -	data_area_of_target += total_in_buf; +	data_area_of_tgt += total_in_tgt; -	/* copy second buffer into end of first buffer */ -	total_in_buf += total_in_buf2; +	total_in_tgt += total_in_src;  	/* is the result too big 
for the field? */ -	if (total_in_buf > USHRT_MAX) +	if (total_in_tgt > USHRT_MAX) { +		cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);  		return -EPROTO; -	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); +	} +	put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);  	/* fix up the BCC */  	byte_count = get_bcc(pTargetSMB); -	byte_count += total_in_buf2; +	byte_count += total_in_src;  	/* is the result too big for the field? */ -	if (byte_count > USHRT_MAX) +	if (byte_count > USHRT_MAX) { +		cFYI(1, "coalesced BCC too large (%u)", byte_count);  		return -EPROTO; +	}  	put_bcc(byte_count, pTargetSMB);  	byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); -	byte_count += total_in_buf2; +	byte_count += total_in_src;  	/* don't allow buffer to overflow */ -	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) +	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { +		cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);  		return -ENOBUFS; +	}  	pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); -	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); +	/* copy second buffer into end of first buffer */ +	memcpy(data_area_of_tgt, data_area_of_src, total_in_src); -	if (remaining == total_in_buf2) { -		cFYI(1, "found the last secondary response"); -		return 0; /* we are done */ -	} else /* more responses to go */ +	if (remaining != total_in_src) { +		/* more responses to go */ +		cFYI(1, "waiting for more secondary responses");  		return 1; +	} + +	/* we are done */ +	cFYI(1, "found the last secondary response"); +	return 0;  }  static void @@ -356,12 +373,22 @@ allocate_buffers(struct TCP_Server_Info *server)  static bool  server_unresponsive(struct TCP_Server_Info *server)  { -	if (echo_retries > 0 && server->tcpStatus == CifsGood && -	    time_after(jiffies, server->lstrp + -				(echo_retries * SMB_ECHO_INTERVAL))) { +	/* +	 * We need to wait 2 echo intervals to make sure we handle such +	 * situations right: +	
 * 1s  client sends a normal SMB request +	 * 2s  client gets a response +	 * 30s echo workqueue job pops, and decides we got a response recently +	 *     and don't need to send another +	 * ... +	 * 65s kernel_recvmsg times out, and we see that we haven't gotten +	 *     a response in >60s. +	 */ +	if (server->tcpStatus == CifsGood && +	    time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {  		cERROR(1, "Server %s has not responded in %d seconds. "  			  "Reconnecting...", server->hostname, -			  (echo_retries * SMB_ECHO_INTERVAL / HZ)); +			  (2 * SMB_ECHO_INTERVAL) / HZ);  		cifs_reconnect(server);  		wake_up(&server->response_q);  		return true; @@ -625,19 +652,11 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)  	spin_unlock(&GlobalMid_Lock);  	wake_up_all(&server->response_q); -	/* -	 * Check if we have blocked requests that need to free. Note that -	 * cifs_max_pending is normally 50, but can be set at module install -	 * time to as little as two. -	 */ -	spin_lock(&GlobalMid_Lock); -	if (atomic_read(&server->inFlight) >= cifs_max_pending) -		atomic_set(&server->inFlight, cifs_max_pending - 1); -	/* -	 * We do not want to set the max_pending too low or we could end up -	 * with the counter going negative. 
-	 */ -	spin_unlock(&GlobalMid_Lock); +	/* check if we have blocked requests that need to free */ +	spin_lock(&server->req_lock); +	if (server->credits <= 0) +		server->credits = 1; +	spin_unlock(&server->req_lock);  	/*  	 * Although there should not be any requests blocked on this queue it  	 * can not hurt to be paranoid and try to wake up requests that may @@ -756,10 +775,11 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)  		cifs_dump_mem("Bad SMB: ", buf,  			min_t(unsigned int, server->total_read, 48)); -	if (mid) -		handle_mid(mid, server, smb_buffer, length); +	if (!mid) +		return length; -	return length; +	handle_mid(mid, server, smb_buffer, length); +	return 0;  }  static int @@ -1578,11 +1598,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,  		}  	} -	if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) { -		cERROR(1, "Multiuser mounts currently require krb5 " -			  "authentication!"); +#ifndef CONFIG_KEYS +	/* Muliuser mounts require CONFIG_KEYS support */ +	if (vol->multiuser) { +		cERROR(1, "Multiuser mounts require kernels with " +			  "CONFIG_KEYS enabled.");  		goto cifs_parse_mount_err;  	} +#endif  	if (vol->UNCip == NULL)  		vol->UNCip = &vol->UNC[2]; @@ -1888,7 +1911,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)  	tcp_ses->noblocksnd = volume_info->noblocksnd;  	tcp_ses->noautotune = volume_info->noautotune;  	tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; -	atomic_set(&tcp_ses->inFlight, 0); +	tcp_ses->in_flight = 0; +	tcp_ses->credits = 1;  	init_waitqueue_head(&tcp_ses->response_q);  	init_waitqueue_head(&tcp_ses->request_q);  	INIT_LIST_HEAD(&tcp_ses->pending_mid_q); @@ -1981,10 +2005,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)  			return 0;  		break;  	default: +		/* NULL username means anonymous session */ +		if (ses->user_name == NULL) { +			if (!vol->nullauth) +				return 0; +			break; +		} +  		/* anything else takes 
username/password */ -		if (ses->user_name == NULL) -			return 0; -		if (strncmp(ses->user_name, vol->username, +		if (strncmp(ses->user_name, +			    vol->username ? vol->username : "",  			    MAX_USERNAME_SIZE))  			return 0;  		if (strlen(vol->username) != 0 && @@ -2039,6 +2069,132 @@ cifs_put_smb_ses(struct cifs_ses *ses)  	cifs_put_tcp_session(server);  } +#ifdef CONFIG_KEYS + +/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */ +#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1) + +/* Populate username and pw fields from keyring if possible */ +static int +cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) +{ +	int rc = 0; +	char *desc, *delim, *payload; +	ssize_t len; +	struct key *key; +	struct TCP_Server_Info *server = ses->server; +	struct sockaddr_in *sa; +	struct sockaddr_in6 *sa6; +	struct user_key_payload *upayload; + +	desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL); +	if (!desc) +		return -ENOMEM; + +	/* try to find an address key first */ +	switch (server->dstaddr.ss_family) { +	case AF_INET: +		sa = (struct sockaddr_in *)&server->dstaddr; +		sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr); +		break; +	case AF_INET6: +		sa6 = (struct sockaddr_in6 *)&server->dstaddr; +		sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr); +		break; +	default: +		cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family); +		rc = -EINVAL; +		goto out_err; +	} + +	cFYI(1, "%s: desc=%s", __func__, desc); +	key = request_key(&key_type_logon, desc, ""); +	if (IS_ERR(key)) { +		if (!ses->domainName) { +			cFYI(1, "domainName is NULL"); +			rc = PTR_ERR(key); +			goto out_err; +		} + +		/* didn't work, try to find a domain key */ +		sprintf(desc, "cifs:d:%s", ses->domainName); +		cFYI(1, "%s: desc=%s", __func__, desc); +		key = request_key(&key_type_logon, desc, ""); +		if (IS_ERR(key)) { +			rc = PTR_ERR(key); +			goto out_err; +		} +	} + +	down_read(&key->sem); +	upayload = key->payload.data; +	if (IS_ERR_OR_NULL(upayload)) { +		rc = upayload ? 
PTR_ERR(upayload) : -EINVAL; +		goto out_key_put; +	} + +	/* find first : in payload */ +	payload = (char *)upayload->data; +	delim = strnchr(payload, upayload->datalen, ':'); +	cFYI(1, "payload=%s", payload); +	if (!delim) { +		cFYI(1, "Unable to find ':' in payload (datalen=%d)", +				upayload->datalen); +		rc = -EINVAL; +		goto out_key_put; +	} + +	len = delim - payload; +	if (len > MAX_USERNAME_SIZE || len <= 0) { +		cFYI(1, "Bad value from username search (len=%zd)", len); +		rc = -EINVAL; +		goto out_key_put; +	} + +	vol->username = kstrndup(payload, len, GFP_KERNEL); +	if (!vol->username) { +		cFYI(1, "Unable to allocate %zd bytes for username", len); +		rc = -ENOMEM; +		goto out_key_put; +	} +	cFYI(1, "%s: username=%s", __func__, vol->username); + +	len = key->datalen - (len + 1); +	if (len > MAX_PASSWORD_SIZE || len <= 0) { +		cFYI(1, "Bad len for password search (len=%zd)", len); +		rc = -EINVAL; +		kfree(vol->username); +		vol->username = NULL; +		goto out_key_put; +	} + +	++delim; +	vol->password = kstrndup(delim, len, GFP_KERNEL); +	if (!vol->password) { +		cFYI(1, "Unable to allocate %zd bytes for password", len); +		rc = -ENOMEM; +		kfree(vol->username); +		vol->username = NULL; +		goto out_key_put; +	} + +out_key_put: +	up_read(&key->sem); +	key_put(key); +out_err: +	kfree(desc); +	cFYI(1, "%s: returning %d", __func__, rc); +	return rc; +} +#else /* ! CONFIG_KEYS */ +static inline int +cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), +		   struct cifs_ses *ses __attribute__((unused))) +{ +	return -ENOSYS; +} +#endif /* CONFIG_KEYS */ +  static bool warned_on_ntlm;  /* globals init to false automatically */  static struct cifs_ses * @@ -2914,18 +3070,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,  #define CIFS_DEFAULT_IOSIZE (1024 * 1024)  /* - * Windows only supports a max of 60k reads. Default to that when posix - * extensions aren't in force. + * Windows only supports a max of 60kb reads and 65535 byte writes. 
Default to + * those values when posix extensions aren't in force. In actuality here, we + * use 65536 to allow for a write that is a multiple of 4k. Most servers seem + * to be ok with the extra byte even though Windows doesn't send writes that + * are that large. + * + * Citation: + * + * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx   */  #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024) +#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)  static unsigned int  cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)  {  	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);  	struct TCP_Server_Info *server = tcon->ses->server; -	unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize : -				CIFS_DEFAULT_IOSIZE; +	unsigned int wsize; + +	/* start with specified wsize, or default */ +	if (pvolume_info->wsize) +		wsize = pvolume_info->wsize; +	else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) +		wsize = CIFS_DEFAULT_IOSIZE; +	else +		wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;  	/* can server support 24-bit write sizes? 
(via UNIX extensions) */  	if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) @@ -3136,10 +3307,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,  		return -EINVAL;  	if (volume_info->nullauth) { -		cFYI(1, "null user"); -		volume_info->username = kzalloc(1, GFP_KERNEL); -		if (volume_info->username == NULL) -			return -ENOMEM; +		cFYI(1, "Anonymous login"); +		kfree(volume_info->username); +		volume_info->username = NULL;  	} else if (volume_info->username) {  		/* BB fixme parse for domain name here */  		cFYI(1, "Username: %s", volume_info->username); @@ -3204,7 +3374,7 @@ cifs_ra_pages(struct cifs_sb_info *cifs_sb)  int  cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)  { -	int rc = 0; +	int rc;  	int xid;  	struct cifs_ses *pSesInfo;  	struct cifs_tcon *tcon; @@ -3231,6 +3401,7 @@ try_mount_again:  		FreeXid(xid);  	}  #endif +	rc = 0;  	tcon = NULL;  	pSesInfo = NULL;  	srvTcp = NULL; @@ -3478,7 +3649,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,  	if (ses->capabilities & CAP_UNICODE) {  		smb_buffer->Flags2 |= SMBFLG2_UNICODE;  		length = -		    cifs_strtoUCS((__le16 *) bcc_ptr, tree, +		    cifs_strtoUTF16((__le16 *) bcc_ptr, tree,  			6 /* max utf8 char length in bytes */ *  			(/* server len*/ + 256 /* share len */), nls_codepage);  		bcc_ptr += 2 * length;	/* convert num 16 bit words to bytes */ @@ -3533,7 +3704,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,  		/* mostly informational -- no need to fail on error here */  		kfree(tcon->nativeFileSystem); -		tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr, +		tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,  						      bytes_left, is_unicode,  						      nls_codepage); @@ -3592,9 +3763,11 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses)  	if (server->maxBuf != 0)  		return 0; +	cifs_set_credits(server, 1);  	rc = CIFSSMBNegotiate(xid, ses);  	if (rc == -EAGAIN) {  		/* retry only once on 1st 
time connection */ +		cifs_set_credits(server, 1);  		rc = CIFSSMBNegotiate(xid, ses);  		if (rc == -EAGAIN)  			rc = -EHOSTDOWN; @@ -3657,25 +3830,43 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,  	return rc;  } +static int +cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) +{ +	switch (ses->server->secType) { +	case Kerberos: +		vol->secFlg = CIFSSEC_MUST_KRB5; +		return 0; +	case NTLMv2: +		vol->secFlg = CIFSSEC_MUST_NTLMV2; +		break; +	case NTLM: +		vol->secFlg = CIFSSEC_MUST_NTLM; +		break; +	case RawNTLMSSP: +		vol->secFlg = CIFSSEC_MUST_NTLMSSP; +		break; +	case LANMAN: +		vol->secFlg = CIFSSEC_MUST_LANMAN; +		break; +	} + +	return cifs_set_cifscreds(vol, ses); +} +  static struct cifs_tcon *  cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)  { +	int rc;  	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);  	struct cifs_ses *ses;  	struct cifs_tcon *tcon = NULL;  	struct smb_vol *vol_info; -	char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */ -			   /* We used to have this as MAX_USERNAME which is   */ -			   /* way too big now (256 instead of 32) */  	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); -	if (vol_info == NULL) { -		tcon = ERR_PTR(-ENOMEM); -		goto out; -	} +	if (vol_info == NULL) +		return ERR_PTR(-ENOMEM); -	snprintf(username, sizeof(username), "krb50x%x", fsuid); -	vol_info->username = username;  	vol_info->local_nls = cifs_sb->local_nls;  	vol_info->linux_uid = fsuid;  	vol_info->cred_uid = fsuid; @@ -3685,8 +3876,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)  	vol_info->local_lease = master_tcon->local_lease;  	vol_info->no_linux_ext = !master_tcon->unix_ext; -	/* FIXME: allow for other secFlg settings */ -	vol_info->secFlg = CIFSSEC_MUST_KRB5; +	rc = cifs_set_vol_auth(vol_info, master_tcon->ses); +	if (rc) { +		tcon = ERR_PTR(rc); +		goto out; +	}  	/* get a reference for the same TCP session */  	spin_lock(&cifs_tcp_ses_lock); @@ -3709,6 
+3903,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)  	if (ses->capabilities & CAP_UNIX)  		reset_cifs_unix_caps(0, tcon, NULL, vol_info);  out: +	kfree(vol_info->username); +	kfree(vol_info->password);  	kfree(vol_info);  	return tcon; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index df8fecb5b99..d172c8ed901 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -171,7 +171,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,  	}  	tcon = tlink_tcon(tlink); -	if (enable_oplocks) +	if (tcon->ses->server->oplocks)  		oplock = REQ_OPLOCK;  	if (nd) @@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,  {  	int xid;  	int rc = 0; /* to get around spurious gcc warning, set to zero here */ -	__u32 oplock = 0; +	__u32 oplock;  	__u16 fileHandle = 0;  	bool posix_open = false;  	struct cifs_sb_info *cifs_sb; @@ -518,6 +518,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,  	}  	pTcon = tlink_tcon(tlink); +	oplock = pTcon->ses->server->oplocks ? REQ_OPLOCK : 0; +  	/*  	 * Don't allow the separator character in a path component.  	 * The VFS will not allow "/", but "\" is allowed by posix. @@ -584,10 +586,26 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,  			 * If either that or op not supported returned, follow  			 * the normal lookup.  			 */ -			if ((rc == 0) || (rc == -ENOENT)) +			switch (rc) { +			case 0: +				/* +				 * The server may allow us to open things like +				 * FIFOs, but the client isn't set up to deal +				 * with that. If it's not a regular file, just +				 * close it and proceed as if it were a normal +				 * lookup. 
+				 */ +				if (newInode && !S_ISREG(newInode->i_mode)) { +					CIFSSMBClose(xid, pTcon, fileHandle); +					break; +				} +			case -ENOENT:  				posix_open = true; -			else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP)) +			case -EOPNOTSUPP: +				break; +			default:  				pTcon->broken_posix_open = true; +			}  		}  		if (!posix_open)  			rc = cifs_get_inode_info_unix(&newInode, full_path, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4dd9283885e..159fcc56dc2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -380,7 +380,7 @@ int cifs_open(struct inode *inode, struct file *file)  	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",  		 inode, file->f_flags, full_path); -	if (enable_oplocks) +	if (tcon->ses->server->oplocks)  		oplock = REQ_OPLOCK;  	else  		oplock = 0; @@ -505,7 +505,7 @@ static int cifs_reopen_file(struct cifsFileInfo *pCifsFile, bool can_flush)  	cFYI(1, "inode = 0x%p file flags 0x%x for %s",  		 inode, pCifsFile->f_flags, full_path); -	if (enable_oplocks) +	if (tcon->ses->server->oplocks)  		oplock = REQ_OPLOCK;  	else  		oplock = 0; @@ -920,16 +920,26 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)  	for (lockp = &inode->i_flock; *lockp != NULL; \  	     lockp = &(*lockp)->fl_next) +struct lock_to_push { +	struct list_head llist; +	__u64 offset; +	__u64 length; +	__u32 pid; +	__u16 netfid; +	__u8 type; +}; +  static int  cifs_push_posix_locks(struct cifsFileInfo *cfile)  {  	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);  	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);  	struct file_lock *flock, **before; -	struct cifsLockInfo *lck, *tmp; +	unsigned int count = 0, i = 0;  	int rc = 0, xid, type; +	struct list_head locks_to_send, *el; +	struct lock_to_push *lck, *tmp;  	__u64 length; -	struct list_head locks_to_send;  	xid = GetXid(); @@ -940,29 +950,56 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)  		return rc;  	} +	lock_flocks(); +	cifs_for_each_lock(cfile->dentry->d_inode, before) { +		if 
((*before)->fl_flags & FL_POSIX) +			count++; +	} +	unlock_flocks(); +  	INIT_LIST_HEAD(&locks_to_send); +	/* +	 * Allocating count locks is enough because no FL_POSIX locks can be +	 * added to the list while we are holding cinode->lock_mutex that +	 * protects locking operations of this inode. +	 */ +	for (; i < count; i++) { +		lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); +		if (!lck) { +			rc = -ENOMEM; +			goto err_out; +		} +		list_add_tail(&lck->llist, &locks_to_send); +	} + +	el = locks_to_send.next;  	lock_flocks();  	cifs_for_each_lock(cfile->dentry->d_inode, before) {  		flock = *before; +		if ((flock->fl_flags & FL_POSIX) == 0) +			continue; +		if (el == &locks_to_send) { +			/* +			 * The list ended. We don't have enough allocated +			 * structures - something is really wrong. +			 */ +			cERROR(1, "Can't push all brlocks!"); +			break; +		}  		length = 1 + flock->fl_end - flock->fl_start;  		if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)  			type = CIFS_RDLCK;  		else  			type = CIFS_WRLCK; - -		lck = cifs_lock_init(flock->fl_start, length, type, -				     cfile->netfid); -		if (!lck) { -			rc = -ENOMEM; -			goto send_locks; -		} +		lck = list_entry(el, struct lock_to_push, llist);  		lck->pid = flock->fl_pid; - -		list_add_tail(&lck->llist, &locks_to_send); +		lck->netfid = cfile->netfid; +		lck->length = length; +		lck->type = type; +		lck->offset = flock->fl_start; +		el = el->next;  	} - -send_locks:  	unlock_flocks();  	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { @@ -979,11 +1016,18 @@ send_locks:  		kfree(lck);  	} +out:  	cinode->can_cache_brlcks = false;  	mutex_unlock(&cinode->lock_mutex);  	FreeXid(xid);  	return rc; +err_out: +	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { +		list_del(&lck->llist); +		kfree(lck); +	} +	goto out;  }  static int diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a5f54b7d982..745da3d0653 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -534,6 +534,11 
@@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,  	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {  		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;  		fattr->cf_dtype = DT_DIR; +		/* +		 * Server can return wrong NumberOfLinks value for directories +		 * when Unix extensions are disabled - fake it. +		 */ +		fattr->cf_nlink = 2;  	} else {  		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;  		fattr->cf_dtype = DT_REG; @@ -541,9 +546,9 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,  		/* clear write bits if ATTR_READONLY is set */  		if (fattr->cf_cifsattrs & ATTR_READONLY)  			fattr->cf_mode &= ~(S_IWUGO); -	} -	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); +		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); +	}  	fattr->cf_uid = cifs_sb->mnt_uid;  	fattr->cf_gid = cifs_sb->mnt_gid; @@ -1322,7 +1327,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)  			}  /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need  	to set uid/gid */ -			inc_nlink(inode);  			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);  			cifs_fill_uniqueid(inode->i_sb, &fattr); @@ -1355,7 +1359,6 @@ mkdir_retry_old:  		d_drop(direntry);  	} else {  mkdir_get_info: -		inc_nlink(inode);  		if (pTcon->unix_ext)  			rc = cifs_get_inode_info_unix(&newinode, full_path,  						      inode->i_sb, xid); @@ -1436,6 +1439,11 @@ mkdir_get_info:  		}  	}  mkdir_out: +	/* +	 * Force revalidate to get parent dir info when needed since cached +	 * attributes are invalid now. 
+	 */ +	CIFS_I(inode)->time = 0;  	kfree(full_path);  	FreeXid(xid);  	cifs_put_tlink(tlink); @@ -1475,7 +1483,6 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)  	cifs_put_tlink(tlink);  	if (!rc) { -		drop_nlink(inode);  		spin_lock(&direntry->d_inode->i_lock);  		i_size_write(direntry->d_inode, 0);  		clear_nlink(direntry->d_inode); @@ -1483,12 +1490,15 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)  	}  	cifsInode = CIFS_I(direntry->d_inode); -	cifsInode->time = 0;	/* force revalidate to go get info when -				   needed */ +	/* force revalidate to go get info when needed */ +	cifsInode->time = 0;  	cifsInode = CIFS_I(inode); -	cifsInode->time = 0;	/* force revalidate to get parent dir info -				   since cached search results now invalid */ +	/* +	 * Force revalidate to get parent dir info when needed since cached +	 * attributes are invalid now. +	 */ +	cifsInode->time = 0;  	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =  		current_fs_time(inode->i_sb); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 703ef5c6fdb..c273c12de98 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -690,3 +690,22 @@ backup_cred(struct cifs_sb_info *cifs_sb)  	return false;  } + +void +cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add) +{ +	spin_lock(&server->req_lock); +	server->credits += add; +	server->in_flight--; +	spin_unlock(&server->req_lock); +	wake_up(&server->request_q); +} + +void +cifs_set_credits(struct TCP_Server_Info *server, const int val) +{ +	spin_lock(&server->req_lock); +	server->credits = val; +	server->oplocks = val > 1 ? 
enable_oplocks : false; +	spin_unlock(&server->req_lock); +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index a090bbe6ee2..e2bbc683e01 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,  		name.name = scratch_buf;  		name.len = -			cifs_from_ucs2((char *)name.name, (__le16 *)de.name, -				       UNICODE_NAME_MAX, -				       min(de.namelen, (size_t)max_len), nlt, -				       cifs_sb->mnt_cifs_flags & +			cifs_from_utf16((char *)name.name, (__le16 *)de.name, +					UNICODE_NAME_MAX, +					min_t(size_t, de.namelen, +					      (size_t)max_len), nlt, +					cifs_sb->mnt_cifs_flags &  						CIFS_MOUNT_MAP_SPECIAL_CHR);  		name.len -= nls_nullsize(nlt);  	} else { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4ec3ee9d72c..551d0c2b973 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)  	int bytes_ret = 0;  	/* Copy OS version */ -	bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, -				  nls_cp); +	bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32, +				    nls_cp);  	bcc_ptr += 2 * bytes_ret; -	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release, -				  32, nls_cp); +	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release, +				    32, nls_cp);  	bcc_ptr += 2 * bytes_ret;  	bcc_ptr += 2; /* trailing null */ -	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, -				  32, nls_cp); +	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, +				    32, nls_cp);  	bcc_ptr += 2 * bytes_ret;  	bcc_ptr += 2; /* trailing null */ @@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,  		*(bcc_ptr+1) = 0;  		bytes_ret = 0;  	} else -		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName, -					  256, nls_cp); +		bytes_ret = 
cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName, +					    256, nls_cp);  	bcc_ptr += 2 * bytes_ret;  	bcc_ptr += 2;  /* account for null terminator */ @@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,  		*bcc_ptr = 0;  		*(bcc_ptr+1) = 0;  	} else { -		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name, -					  MAX_USERNAME_SIZE, nls_cp); +		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name, +					    MAX_USERNAME_SIZE, nls_cp);  	}  	bcc_ptr += 2 * bytes_ret;  	bcc_ptr += 2; /* account for null termination */ @@ -246,16 +246,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,  	/* copy user */  	/* BB what about null user mounts - check that we do this BB */  	/* copy user */ -	if (ses->user_name != NULL) +	if (ses->user_name != NULL) {  		strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); +		bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); +	}  	/* else null user mount */ - -	bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);  	*bcc_ptr = 0;  	bcc_ptr++; /* account for null termination */  	/* copy domain */ -  	if (ses->domainName != NULL) {  		strncpy(bcc_ptr, ses->domainName, 256);  		bcc_ptr += strnlen(ses->domainName, 256); @@ -287,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,  	cFYI(1, "bleft %d", bleft);  	kfree(ses->serverOS); -	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); +	ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);  	cFYI(1, "serverOS=%s", ses->serverOS);  	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;  	data += len; @@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,  		return;  	kfree(ses->serverNOS); -	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp); +	ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);  	cFYI(1, "serverNOS=%s", ses->serverNOS);  	len = 
(UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;  	data += len; @@ -305,7 +304,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,  		return;  	kfree(ses->serverDomain); -	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp); +	ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);  	cFYI(1, "serverDomain=%s", ses->serverDomain);  	return; @@ -395,6 +394,10 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,  	ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);  	tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);  	tilen = le16_to_cpu(pblob->TargetInfoArray.Length); +	if (tioffset > blob_len || tioffset + tilen > blob_len) { +		cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen); +		return -EINVAL; +	}  	if (tilen) {  		ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);  		if (!ses->auth_key.response) { @@ -502,8 +505,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,  		tmp += 2;  	} else {  		int len; -		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, -				    MAX_USERNAME_SIZE, nls_cp); +		len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, +				      MAX_USERNAME_SIZE, nls_cp);  		len *= 2; /* unicode is 2 bytes each */  		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);  		sec_blob->DomainName.Length = cpu_to_le16(len); @@ -518,8 +521,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,  		tmp += 2;  	} else {  		int len; -		len = cifs_strtoUCS((__le16 *)tmp, ses->user_name, -				    MAX_USERNAME_SIZE, nls_cp); +		len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, +				      MAX_USERNAME_SIZE, nls_cp);  		len *= 2; /* unicode is 2 bytes each */  		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);  		sec_blob->UserName.Length = cpu_to_le16(len); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 80d85088193..d5cd9aa7eac 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c 
@@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,  	/* Password cannot be longer than 128 characters */  	if (passwd) /* Password must be converted to NT unicode */ -		len = cifs_strtoUCS(wpwd, passwd, 128, codepage); +		len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);  	else {  		len = 0;  		*wpwd = 0; /* Ensure string is null terminated */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 0cc9584f588..310918b6fcb 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -254,44 +254,60 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,  	return smb_sendv(server, &iov, 1);  } -static int wait_for_free_request(struct TCP_Server_Info *server, -				 const int long_op) +static int +wait_for_free_credits(struct TCP_Server_Info *server, const int optype, +		      int *credits)  { -	if (long_op == CIFS_ASYNC_OP) { +	int rc; + +	spin_lock(&server->req_lock); +	if (optype == CIFS_ASYNC_OP) {  		/* oplock breaks must not be held up */ -		atomic_inc(&server->inFlight); +		server->in_flight++; +		*credits -= 1; +		spin_unlock(&server->req_lock);  		return 0;  	} -	spin_lock(&GlobalMid_Lock);  	while (1) { -		if (atomic_read(&server->inFlight) >= cifs_max_pending) { -			spin_unlock(&GlobalMid_Lock); +		if (*credits <= 0) { +			spin_unlock(&server->req_lock);  			cifs_num_waiters_inc(server); -			wait_event(server->request_q, -				   atomic_read(&server->inFlight) -				     < cifs_max_pending); +			rc = wait_event_killable(server->request_q, +						 has_credits(server, credits));  			cifs_num_waiters_dec(server); -			spin_lock(&GlobalMid_Lock); +			if (rc) +				return rc; +			spin_lock(&server->req_lock);  		} else {  			if (server->tcpStatus == CifsExiting) { -				spin_unlock(&GlobalMid_Lock); +				spin_unlock(&server->req_lock);  				return -ENOENT;  			} -			/* can not count locking commands against total -			   as they are allowed to block on server */ +			/* +			 * Can not count locking commands against 
total +			 * as they are allowed to block on server. +			 */  			/* update # of requests on the wire to server */ -			if (long_op != CIFS_BLOCKING_OP) -				atomic_inc(&server->inFlight); -			spin_unlock(&GlobalMid_Lock); +			if (optype != CIFS_BLOCKING_OP) { +				*credits -= 1; +				server->in_flight++; +			} +			spin_unlock(&server->req_lock);  			break;  		}  	}  	return 0;  } +static int +wait_for_free_request(struct TCP_Server_Info *server, const int optype) +{ +	return wait_for_free_credits(server, optype, get_credits_field(server)); +} +  static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,  			struct mid_q_entry **ppmidQ)  { @@ -359,7 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,  	mid = AllocMidQEntry(hdr, server);  	if (mid == NULL) {  		mutex_unlock(&server->srv_mutex); -		atomic_dec(&server->inFlight); +		cifs_add_credits(server, 1);  		wake_up(&server->request_q);  		return -ENOMEM;  	} @@ -392,7 +408,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,  	return rc;  out_err:  	delete_mid(mid); -	atomic_dec(&server->inFlight); +	cifs_add_credits(server, 1);  	wake_up(&server->request_q);  	return rc;  } @@ -564,8 +580,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,  		mutex_unlock(&ses->server->srv_mutex);  		cifs_small_buf_release(in_buf);  		/* Update # of requests on wire to server */ -		atomic_dec(&ses->server->inFlight); -		wake_up(&ses->server->request_q); +		cifs_add_credits(ses->server, 1);  		return rc;  	}  	rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); @@ -601,8 +616,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,  			midQ->callback = DeleteMidQEntry;  			spin_unlock(&GlobalMid_Lock);  			cifs_small_buf_release(in_buf); -			atomic_dec(&ses->server->inFlight); -			wake_up(&ses->server->request_q); +			cifs_add_credits(ses->server, 1);  			return rc;  		}  		spin_unlock(&GlobalMid_Lock); @@ -612,8 +626,7 @@ SendReceive2(const 
unsigned int xid, struct cifs_ses *ses,  	rc = cifs_sync_mid_result(midQ, ses->server);  	if (rc != 0) { -		atomic_dec(&ses->server->inFlight); -		wake_up(&ses->server->request_q); +		cifs_add_credits(ses->server, 1);  		return rc;  	} @@ -637,8 +650,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,  		midQ->resp_buf = NULL;  out:  	delete_mid(midQ); -	atomic_dec(&ses->server->inFlight); -	wake_up(&ses->server->request_q); +	cifs_add_credits(ses->server, 1);  	return rc;  } @@ -688,8 +700,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,  	if (rc) {  		mutex_unlock(&ses->server->srv_mutex);  		/* Update # of requests on wire to server */ -		atomic_dec(&ses->server->inFlight); -		wake_up(&ses->server->request_q); +		cifs_add_credits(ses->server, 1);  		return rc;  	} @@ -721,8 +732,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,  			/* no longer considered to be "in-flight" */  			midQ->callback = DeleteMidQEntry;  			spin_unlock(&GlobalMid_Lock); -			atomic_dec(&ses->server->inFlight); -			wake_up(&ses->server->request_q); +			cifs_add_credits(ses->server, 1);  			return rc;  		}  		spin_unlock(&GlobalMid_Lock); @@ -730,8 +740,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,  	rc = cifs_sync_mid_result(midQ, ses->server);  	if (rc != 0) { -		atomic_dec(&ses->server->inFlight); -		wake_up(&ses->server->request_q); +		cifs_add_credits(ses->server, 1);  		return rc;  	} @@ -747,8 +756,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,  	rc = cifs_check_receive(midQ, ses->server, 0);  out:  	delete_mid(midQ); -	atomic_dec(&ses->server->inFlight); -	wake_up(&ses->server->request_q); +	cifs_add_credits(ses->server, 1);  	return rc;  } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 45f07c46f3e..10d92cf57ab 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -105,7 +105,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,  	struct cifs_tcon *pTcon;  	struct super_block *sb;  	char 
*full_path; -	struct cifs_ntsd *pacl;  	if (direntry == NULL)  		return -EIO; @@ -164,23 +163,24 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,  			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);  	} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,  			strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL +		struct cifs_ntsd *pacl;  		pacl = kmalloc(value_size, GFP_KERNEL);  		if (!pacl) {  			cFYI(1, "%s: Can't allocate memory for ACL",  					__func__);  			rc = -ENOMEM;  		} else { -#ifdef CONFIG_CIFS_ACL  			memcpy(pacl, ea_value, value_size);  			rc = set_cifs_acl(pacl, value_size,  				direntry->d_inode, full_path, CIFS_ACL_DACL);  			if (rc == 0) /* force revalidate of the inode */  				CIFS_I(direntry->d_inode)->time = 0;  			kfree(pacl); +		}  #else  			cFYI(1, "Set CIFS ACL not supported yet");  #endif /* CONFIG_CIFS_ACL */ -		}  	} else {  		int temp;  		temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 5e2e1b3f068..05156c17b55 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -208,13 +208,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)          if (IS_ERR(root)) {  		error = PTR_ERR(root);  		printk("Failure of coda_cnode_make for root: error %d\n", error); -		root = NULL;  		goto error;  	}   	printk("coda_read_super: rootinode is %ld dev %s\n",   	       root->i_ino, root->i_sb->s_id); -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) {  		error = -EINVAL;  		goto error; @@ -222,9 +221,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)  	return 0;  error: -	if (root) -		iput(root); -  	mutex_lock(&vc->vc_mutex);  	bdi_destroy(&vc->bdi);  	vc->vc_sb = NULL; diff --git a/fs/compat.c b/fs/compat.c index fa9d721ecfe..14483a715bb 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -33,7 +33,6 @@  #include <linux/nfs4_mount.h>  #include <linux/syscalls.h>  #include <linux/ctype.h> 
-#include <linux/module.h>  #include <linux/dirent.h>  #include <linux/fsnotify.h>  #include <linux/highuid.h> @@ -131,41 +130,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim  static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)  { -	compat_ino_t ino = stat->ino; -	typeof(ubuf->st_uid) uid = 0; -	typeof(ubuf->st_gid) gid = 0; -	int err; +	struct compat_stat tmp; -	SET_UID(uid, stat->uid); -	SET_GID(gid, stat->gid); +	if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) +		return -EOVERFLOW; -	if ((u64) stat->size > MAX_NON_LFS || -	    !old_valid_dev(stat->dev) || -	    !old_valid_dev(stat->rdev)) +	memset(&tmp, 0, sizeof(tmp)); +	tmp.st_dev = old_encode_dev(stat->dev); +	tmp.st_ino = stat->ino; +	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)  		return -EOVERFLOW; -	if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) +	tmp.st_mode = stat->mode; +	tmp.st_nlink = stat->nlink; +	if (tmp.st_nlink != stat->nlink)  		return -EOVERFLOW; - -	if (clear_user(ubuf, sizeof(*ubuf))) -		return -EFAULT; - -	err  = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); -	err |= __put_user(ino, &ubuf->st_ino); -	err |= __put_user(stat->mode, &ubuf->st_mode); -	err |= __put_user(stat->nlink, &ubuf->st_nlink); -	err |= __put_user(uid, &ubuf->st_uid); -	err |= __put_user(gid, &ubuf->st_gid); -	err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); -	err |= __put_user(stat->size, &ubuf->st_size); -	err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); -	err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); -	err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); -	err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); -	err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); -	err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); -	err |= __put_user(stat->blksize, &ubuf->st_blksize); -	err |= __put_user(stat->blocks, &ubuf->st_blocks); -	return err; +	
SET_UID(tmp.st_uid, stat->uid); +	SET_GID(tmp.st_gid, stat->gid); +	tmp.st_rdev = old_encode_dev(stat->rdev); +	if ((u64) stat->size > MAX_NON_LFS) +		return -EOVERFLOW; +	tmp.st_size = stat->size; +	tmp.st_atime = stat->atime.tv_sec; +	tmp.st_atime_nsec = stat->atime.tv_nsec; +	tmp.st_mtime = stat->mtime.tv_sec; +	tmp.st_mtime_nsec = stat->mtime.tv_nsec; +	tmp.st_ctime = stat->ctime.tv_sec; +	tmp.st_ctime_nsec = stat->ctime.tv_nsec; +	tmp.st_blocks = stat->blocks; +	tmp.st_blksize = stat->blksize; +	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;  }  asmlinkage long compat_sys_newstat(const char __user * filename, diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a26bea10e81..debdfe0fc80 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -34,7 +34,7 @@  #include <linux/fs.h>  #include <linux/file.h>  #include <linux/ppp_defs.h> -#include <linux/if_ppp.h> +#include <linux/ppp-ioctl.h>  #include <linux/if_pppox.h>  #include <linux/mtio.h>  #include <linux/auto_fs.h> @@ -49,7 +49,6 @@  #include <linux/elevator.h>  #include <linux/rtc.h>  #include <linux/pci.h> -#include <linux/module.h>  #include <linux/serial.h>  #include <linux/if_tun.h>  #include <linux/ctype.h> diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index ede857d20a0..b5f0a3b91f1 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -58,12 +58,11 @@ struct configfs_dirent {  extern struct mutex configfs_symlink_mutex;  extern spinlock_t configfs_dirent_lock; -extern struct vfsmount * configfs_mount;  extern struct kmem_cache *configfs_dir_cachep;  extern int configfs_is_root(struct config_item *item); -extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *); +extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);  extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));  extern int configfs_inode_init(void);  
extern void configfs_inode_exit(void); @@ -80,15 +79,15 @@ extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);  extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);  extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); -extern int configfs_pin_fs(void); +extern struct dentry *configfs_pin_fs(void);  extern void configfs_release_fs(void);  extern struct rw_semaphore configfs_rename_sem; -extern struct super_block * configfs_sb;  extern const struct file_operations configfs_dir_operations;  extern const struct file_operations configfs_file_operations;  extern const struct file_operations bin_fops;  extern const struct inode_operations configfs_dir_inode_operations; +extern const struct inode_operations configfs_root_inode_operations;  extern const struct inode_operations configfs_symlink_inode_operations;  extern const struct dentry_operations configfs_dentry_ops; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5ddd7ebd9dc..7e6c52d8a20 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -264,11 +264,13 @@ static int init_symlink(struct inode * inode)  	return 0;  } -static int create_dir(struct config_item * k, struct dentry * p, -		      struct dentry * d) +static int create_dir(struct config_item *k, struct dentry *d)  {  	int error;  	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; +	struct dentry *p = d->d_parent; + +	BUG_ON(!k);  	error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);  	if (!error) @@ -304,19 +306,7 @@ static int create_dir(struct config_item * k, struct dentry * p,  static int configfs_create_dir(struct config_item * item, struct dentry *dentry)  { -	struct dentry * parent; -	int error = 0; - -	BUG_ON(!item); - -	if (item->ci_parent) -		parent = item->ci_parent->ci_dentry; -	else if (configfs_mount) -		parent = configfs_mount->mnt_root; -	else -		return -EFAULT; - -	error = create_dir(item,parent,dentry); +	int error = create_dir(item, dentry); 
 	if (!error)  		item->ci_dentry = dentry;  	return error; @@ -1079,23 +1069,24 @@ int configfs_depend_item(struct configfs_subsystem *subsys,  	int ret;  	struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;  	struct config_item *s_item = &subsys->su_group.cg_item; +	struct dentry *root;  	/*  	 * Pin the configfs filesystem.  This means we can safely access  	 * the root of the configfs filesystem.  	 */ -	ret = configfs_pin_fs(); -	if (ret) -		return ret; +	root = configfs_pin_fs(); +	if (IS_ERR(root)) +		return PTR_ERR(root);  	/*  	 * Next, lock the root directory.  We're going to check that the  	 * subsystem is really registered, and so we need to lock out  	 * configfs_[un]register_subsystem().  	 */ -	mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); +	mutex_lock(&root->d_inode->i_mutex); -	root_sd = configfs_sb->s_root->d_fsdata; +	root_sd = root->d_fsdata;  	list_for_each_entry(p, &root_sd->s_children, s_sibling) {  		if (p->s_type & CONFIGFS_DIR) { @@ -1129,7 +1120,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys,  out_unlock_dirent_lock:  	spin_unlock(&configfs_dirent_lock);  out_unlock_fs: -	mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); +	mutex_unlock(&root->d_inode->i_mutex);  	/*  	 * If we succeeded, the fs is pinned via other methods.  
If not, @@ -1183,11 +1174,6 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode  	struct module *subsys_owner = NULL, *new_item_owner = NULL;  	char *name; -	if (dentry->d_parent == configfs_sb->s_root) { -		ret = -EPERM; -		goto out; -	} -  	sd = dentry->d_parent->d_fsdata;  	/* @@ -1359,9 +1345,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)  	struct module *subsys_owner = NULL, *dead_item_owner = NULL;  	int ret; -	if (dentry->d_parent == configfs_sb->s_root) -		return -EPERM; -  	sd = dentry->d_fsdata;  	if (sd->s_type & CONFIGFS_USET_DEFAULT)  		return -EPERM; @@ -1459,6 +1442,11 @@ const struct inode_operations configfs_dir_inode_operations = {  	.setattr	= configfs_setattr,  }; +const struct inode_operations configfs_root_inode_operations = { +	.lookup		= configfs_lookup, +	.setattr	= configfs_setattr, +}; +  #if 0  int configfs_rename_dir(struct config_item * item, const char *new_name)  { @@ -1546,6 +1534,7 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)  static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)  {  	struct dentry *dentry = filp->f_path.dentry; +	struct super_block *sb = dentry->d_sb;  	struct configfs_dirent * parent_sd = dentry->d_fsdata;  	struct configfs_dirent *cursor = filp->private_data;  	struct list_head *p, *q = &cursor->s_sibling; @@ -1608,7 +1597,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir  					ino = inode->i_ino;  				spin_unlock(&configfs_dirent_lock);  				if (!inode) -					ino = iunique(configfs_sb, 2); +					ino = iunique(sb, 2);  				if (filldir(dirent, name, len, filp->f_pos, ino,  						 dt_type(next)) < 0) @@ -1680,27 +1669,27 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)  	struct config_group *group = &subsys->su_group;  	struct qstr name;  	struct dentry *dentry; +	struct dentry *root;  	struct configfs_dirent *sd; -	err = configfs_pin_fs(); -	if (err) 
-		return err; +	root = configfs_pin_fs(); +	if (IS_ERR(root)) +		return PTR_ERR(root);  	if (!group->cg_item.ci_name)  		group->cg_item.ci_name = group->cg_item.ci_namebuf; -	sd = configfs_sb->s_root->d_fsdata; +	sd = root->d_fsdata;  	link_group(to_config_group(sd->s_element), group); -	mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, -			I_MUTEX_PARENT); +	mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);  	name.name = group->cg_item.ci_name;  	name.len = strlen(name.name);  	name.hash = full_name_hash(name.name, name.len);  	err = -ENOMEM; -	dentry = d_alloc(configfs_sb->s_root, &name); +	dentry = d_alloc(root, &name);  	if (dentry) {  		d_add(dentry, NULL); @@ -1717,7 +1706,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)  		}  	} -	mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); +	mutex_unlock(&root->d_inode->i_mutex);  	if (err) {  		unlink_group(group); @@ -1731,13 +1720,14 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)  {  	struct config_group *group = &subsys->su_group;  	struct dentry *dentry = group->cg_item.ci_dentry; +	struct dentry *root = dentry->d_sb->s_root; -	if (dentry->d_parent != configfs_sb->s_root) { +	if (dentry->d_parent != root) {  		printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");  		return;  	} -	mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, +	mutex_lock_nested(&root->d_inode->i_mutex,  			  I_MUTEX_PARENT);  	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);  	mutex_lock(&configfs_symlink_mutex); @@ -1754,7 +1744,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)  	d_delete(dentry); -	mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); +	mutex_unlock(&root->d_inode->i_mutex);  	dput(dentry); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 3ee36d41886..0074362d9f7 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -44,8 +44,6 @@  static struct lock_class_key 
default_group_class[MAX_LOCK_DEPTH];  #endif -extern struct super_block * configfs_sb; -  static const struct address_space_operations configfs_aops = {  	.readpage	= simple_readpage,  	.write_begin	= simple_write_begin, @@ -132,9 +130,10 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)  	inode->i_ctime = iattr->ia_ctime;  } -struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent * sd) +struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, +				 struct super_block *s)  { -	struct inode * inode = new_inode(configfs_sb); +	struct inode * inode = new_inode(s);  	if (inode) {  		inode->i_ino = get_next_ino();  		inode->i_mapping->a_ops = &configfs_aops; @@ -188,36 +187,35 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,  int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *))  {  	int error = 0; -	struct inode * inode = NULL; -	if (dentry) { -		if (!dentry->d_inode) { -			struct configfs_dirent *sd = dentry->d_fsdata; -			if ((inode = configfs_new_inode(mode, sd))) { -				if (dentry->d_parent && dentry->d_parent->d_inode) { -					struct inode *p_inode = dentry->d_parent->d_inode; -					p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; -				} -				configfs_set_inode_lock_class(sd, inode); -				goto Proceed; -			} -			else -				error = -ENOMEM; -		} else -			error = -EEXIST; -	} else -		error = -ENOENT; -	goto Done; +	struct inode *inode = NULL; +	struct configfs_dirent *sd; +	struct inode *p_inode; + +	if (!dentry) +		return -ENOENT; + +	if (dentry->d_inode) +		return -EEXIST; - Proceed: -	if (init) +	sd = dentry->d_fsdata; +	inode = configfs_new_inode(mode, sd, dentry->d_sb); +	if (!inode) +		return -ENOMEM; + +	p_inode = dentry->d_parent->d_inode; +	p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; +	configfs_set_inode_lock_class(sd, inode); + +	if (init) {  		error = init(inode); -	if (!error) { -		d_instantiate(dentry, inode); -		if 
(S_ISDIR(mode) || S_ISLNK(mode)) -			dget(dentry);  /* pin link and directory dentries in core */ -	} else -		iput(inode); - Done: +		if (error) { +			iput(inode); +			return error; +		} +	} +	d_instantiate(dentry, inode); +	if (S_ISDIR(mode) || S_ISLNK(mode)) +		dget(dentry);  /* pin link and directory dentries in core */  	return error;  } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 276e15cafd5..aee0a7ebbd8 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -37,8 +37,7 @@  /* Random magic number */  #define CONFIGFS_MAGIC 0x62656570 -struct vfsmount * configfs_mount = NULL; -struct super_block * configfs_sb = NULL; +static struct vfsmount *configfs_mount = NULL;  struct kmem_cache *configfs_dir_cachep;  static int configfs_mnt_count = 0; @@ -77,12 +76,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_magic = CONFIGFS_MAGIC;  	sb->s_op = &configfs_ops;  	sb->s_time_gran = 1; -	configfs_sb = sb;  	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, -				   &configfs_root); +				   &configfs_root, sb);  	if (inode) { -		inode->i_op = &configfs_dir_inode_operations; +		inode->i_op = &configfs_root_inode_operations;  		inode->i_fop = &configfs_dir_operations;  		/* directory inodes start off with i_nlink == 2 (for "." 
entry) */  		inc_nlink(inode); @@ -91,10 +89,9 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)  		return -ENOMEM;  	} -	root = d_alloc_root(inode); +	root = d_make_root(inode);  	if (!root) {  		pr_debug("%s: could not get root dentry!\n",__func__); -		iput(inode);  		return -ENOMEM;  	}  	config_group_init(&configfs_root_group); @@ -118,10 +115,11 @@ static struct file_system_type configfs_fs_type = {  	.kill_sb	= kill_litter_super,  }; -int configfs_pin_fs(void) +struct dentry *configfs_pin_fs(void)  { -	return simple_pin_fs(&configfs_fs_type, &configfs_mount, +	int err = simple_pin_fs(&configfs_fs_type, &configfs_mount,  			     &configfs_mnt_count); +	return err ? ERR_PTR(err) : configfs_mount->mnt_root;  }  void configfs_release_fs(void) diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 0f3eb41d920..cc9f2546ea4 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -110,13 +110,13 @@ out:  static int get_target(const char *symname, struct path *path, -		      struct config_item **target) +		      struct config_item **target, struct super_block *sb)  {  	int ret;  	ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);  	if (!ret) { -		if (path->dentry->d_sb == configfs_sb) { +		if (path->dentry->d_sb == sb) {  			*target = configfs_get_config_item(path->dentry);  			if (!*target) {  				ret = -ENOENT; @@ -141,10 +141,6 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna  	struct config_item *target_item = NULL;  	struct config_item_type *type; -	ret = -EPERM;  /* What lack-of-symlink returns */ -	if (dentry->d_parent == configfs_sb->s_root) -		goto out; -  	sd = dentry->d_parent->d_fsdata;  	/*  	 * Fake invisibility if dir belongs to a group/default groups hierarchy @@ -162,7 +158,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna  	    !type->ct_item_ops->allow_link)  		goto out_put; -	ret = get_target(symname, &path, 
&target_item); +	ret = get_target(symname, &path, &target_item, dentry->d_sb);  	if (ret)  		goto out_put; @@ -198,8 +194,6 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)  	if (!(sd->s_type & CONFIGFS_ITEM_LINK))  		goto out; -	BUG_ON(dentry->d_parent == configfs_sb->s_root); -  	sl = sd->s_element;  	parent_item = configfs_get_config_item(dentry->d_parent); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a2ee8f9f5a3..d013c46402e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -257,10 +257,10 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)  	/* Do sanity checks on the superblock */  	if (super.magic != CRAMFS_MAGIC) { -		/* check for wrong endianess */ +		/* check for wrong endianness */  		if (super.magic == CRAMFS_MAGIC_WEND) {  			if (!silent) -				printk(KERN_ERR "cramfs: wrong endianess\n"); +				printk(KERN_ERR "cramfs: wrong endianness\n");  			goto out;  		} @@ -270,7 +270,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)  		mutex_unlock(&read_mutex);  		if (super.magic != CRAMFS_MAGIC) {  			if (super.magic == CRAMFS_MAGIC_WEND && !silent) -				printk(KERN_ERR "cramfs: wrong endianess\n"); +				printk(KERN_ERR "cramfs: wrong endianness\n");  			else if (!silent)  				printk(KERN_ERR "cramfs: wrong magic\n");  			goto out; @@ -318,11 +318,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)  	root = get_cramfs_inode(sb, &super.root, 0);  	if (IS_ERR(root))  		goto out; -	sb->s_root = d_alloc_root(root); -	if (!sb->s_root) { -		iput(root); +	sb->s_root = d_make_root(root); +	if (!sb->s_root)  		goto out; -	}  	return 0;  out:  	kfree(sbi); diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc0..b60ddc41d78 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -23,7 +23,7 @@  #include <linux/init.h>  #include <linux/hash.h>  #include <linux/cache.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/mount.h>  #include 
<linux/file.h>  #include <asm/uaccess.h> @@ -104,11 +104,11 @@ static unsigned int d_hash_shift __read_mostly;  static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct hlist_bl_head *d_hash(struct dentry *parent, -					unsigned long hash) +static inline struct hlist_bl_head *d_hash(const struct dentry *parent, +					unsigned int hash)  { -	hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; -	hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); +	hash += (unsigned long) parent / L1_CACHE_BYTES; +	hash = hash + (hash >> D_HASHBITS);  	return dentry_hashtable + (hash & D_HASHMASK);  } @@ -137,6 +137,49 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,  }  #endif +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +static inline int dentry_cmp(const unsigned char *cs, size_t scount, +				const unsigned char *ct, size_t tcount) +{ +#ifdef CONFIG_DCACHE_WORD_ACCESS +	unsigned long a,b,mask; + +	if (unlikely(scount != tcount)) +		return 1; + +	for (;;) { +		a = *(unsigned long *)cs; +		b = *(unsigned long *)ct; +		if (tcount < sizeof(unsigned long)) +			break; +		if (unlikely(a != b)) +			return 1; +		cs += sizeof(unsigned long); +		ct += sizeof(unsigned long); +		tcount -= sizeof(unsigned long); +		if (!tcount) +			return 0; +	} +	mask = ~(~0ul << tcount*8); +	return unlikely(!!((a ^ b) & mask)); +#else +	if (scount != tcount) +		return 1; + +	do { +		if (*cs != *ct) +			return 1; +		cs++; +		ct++; +		tcount--; +	} while (tcount); +	return 0; +#endif +} +  static void __d_free(struct rcu_head *head)  {  	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); @@ -1423,30 +1466,6 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)  EXPORT_SYMBOL(d_instantiate_unique); -/** - * d_alloc_root - allocate root dentry - * @root_inode: inode to allocate the root for - 
* - * Allocate a root ("/") dentry for the inode given. The inode is - * instantiated and returned. %NULL is returned if there is insufficient - * memory or the inode passed is %NULL. - */ -  -struct dentry * d_alloc_root(struct inode * root_inode) -{ -	struct dentry *res = NULL; - -	if (root_inode) { -		static const struct qstr name = { .name = "/", .len = 1 }; - -		res = __d_alloc(root_inode->i_sb, &name); -		if (res) -			d_instantiate(res, root_inode); -	} -	return res; -} -EXPORT_SYMBOL(d_alloc_root); -  struct dentry *d_make_root(struct inode *root_inode)  {  	struct dentry *res = NULL; @@ -1694,7 +1713,7 @@ EXPORT_SYMBOL(d_add_ci);   * __d_lookup_rcu - search for a dentry (racy, store-free)   * @parent: parent dentry   * @name: qstr of name we wish to find - * @seq: returns d_seq value at the point where the dentry was found + * @seqp: returns d_seq value at the point where the dentry was found   * @inode: returns dentry->d_inode when the inode was found valid.   * Returns: dentry, or NULL   * @@ -1717,8 +1736,9 @@ EXPORT_SYMBOL(d_add_ci);   * child is looked up. Thus, an interlocking stepping of sequence lock checks   * is formed, giving integrity down the path walk.   */ -struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, -				unsigned *seq, struct inode **inode) +struct dentry *__d_lookup_rcu(const struct dentry *parent, +				const struct qstr *name, +				unsigned *seqp, struct inode **inode)  {  	unsigned int len = name->len;  	unsigned int hash = name->hash; @@ -1748,6 +1768,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,  	 * See Documentation/filesystems/path-lookup.txt for more details.  	 
*/  	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { +		unsigned seq;  		struct inode *i;  		const char *tname;  		int tlen; @@ -1756,7 +1777,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,  			continue;  seqretry: -		*seq = read_seqcount_begin(&dentry->d_seq); +		seq = read_seqcount_begin(&dentry->d_seq);  		if (dentry->d_parent != parent)  			continue;  		if (d_unhashed(dentry)) @@ -1771,7 +1792,7 @@ seqretry:  		 * edge of memory when walking. If we could load this  		 * atomically some other way, we could drop this check.  		 */ -		if (read_seqcount_retry(&dentry->d_seq, *seq)) +		if (read_seqcount_retry(&dentry->d_seq, seq))  			goto seqretry;  		if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {  			if (parent->d_op->d_compare(parent, *inode, @@ -1788,6 +1809,7 @@ seqretry:  		 * order to do anything useful with the returned dentry  		 * anyway.  		 */ +		*seqp = seq;  		*inode = i;  		return dentry;  	} @@ -2382,6 +2404,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)  			if (d_ancestor(alias, dentry)) {  				/* Check for loops */  				actual = ERR_PTR(-ELOOP); +				spin_unlock(&inode->i_lock);  			} else if (IS_ROOT(alias)) {  				/* Is this an anonymous mountpoint that we  				 * could splice into our tree? */ @@ -2391,7 +2414,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)  				goto found;  			} else {  				/* Nope, but we must(!) avoid directory -				 * aliasing */ +				 * aliasing. This drops inode->i_lock */  				actual = __d_unalias(inode, dentry, alias);  			}  			write_sequnlock(&rename_lock); @@ -2968,7 +2991,7 @@ __setup("dhash_entries=", set_dhash_entries);  static void __init dcache_init_early(void)  { -	int loop; +	unsigned int loop;  	/* If hashes are distributed across NUMA nodes, defer  	 * hash allocation until vmalloc space is available. 
@@ -2986,13 +3009,13 @@ static void __init dcache_init_early(void)  					&d_hash_mask,  					0); -	for (loop = 0; loop < (1 << d_hash_shift); loop++) +	for (loop = 0; loop < (1U << d_hash_shift); loop++)  		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);  }  static void __init dcache_init(void)  { -	int loop; +	unsigned int loop;  	/*   	 * A constructor could be added for stable state like the lists, @@ -3016,7 +3039,7 @@ static void __init dcache_init(void)  					&d_hash_mask,  					0); -	for (loop = 0; loop < (1 << d_hash_shift); loop++) +	for (loop = 0; loop < (1U << d_hash_shift); loop++)  		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);  } diff --git a/fs/dcookies.c b/fs/dcookies.c index dda0dc702d1..17c77996782 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -13,7 +13,7 @@   */  #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/slab.h>  #include <linux/list.h>  #include <linux/mount.h> diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index f65d4455c5e..21e93605161 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob);   * debugfs_print_regs32 - use seq_print to describe a set of registers   * @s: the seq_file structure being used to generate output   * @regs: an array if struct debugfs_reg32 structures - * @mregs: the length of the above array + * @nregs: the length of the above array   * @base: the base address to be used in reading the registers   * @prefix: a string to be prefixed to every output line   * @@ -611,7 +611,7 @@ static const struct file_operations fops_regset32 = {   * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling   * code.   
*/ -struct dentry *debugfs_create_regset32(const char *name, mode_t mode, +struct dentry *debugfs_create_regset32(const char *name, umode_t mode,  				       struct dentry *parent,  				       struct debugfs_regset32 *regset)  { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 956d5ddddf6..b80bc846a15 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -23,9 +23,13 @@  #include <linux/debugfs.h>  #include <linux/fsnotify.h>  #include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/parser.h>  #include <linux/magic.h>  #include <linux/slab.h> +#define DEBUGFS_DEFAULT_MODE	0755 +  static struct vfsmount *debugfs_mount;  static int debugfs_mount_count;  static bool debugfs_registered; @@ -125,11 +129,154 @@ static inline int debugfs_positive(struct dentry *dentry)  	return dentry->d_inode && !d_unhashed(dentry);  } +struct debugfs_mount_opts { +	uid_t uid; +	gid_t gid; +	umode_t mode; +}; + +enum { +	Opt_uid, +	Opt_gid, +	Opt_mode, +	Opt_err +}; + +static const match_table_t tokens = { +	{Opt_uid, "uid=%u"}, +	{Opt_gid, "gid=%u"}, +	{Opt_mode, "mode=%o"}, +	{Opt_err, NULL} +}; + +struct debugfs_fs_info { +	struct debugfs_mount_opts mount_opts; +}; + +static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) +{ +	substring_t args[MAX_OPT_ARGS]; +	int option; +	int token; +	char *p; + +	opts->mode = DEBUGFS_DEFAULT_MODE; + +	while ((p = strsep(&data, ",")) != NULL) { +		if (!*p) +			continue; + +		token = match_token(p, tokens, args); +		switch (token) { +		case Opt_uid: +			if (match_int(&args[0], &option)) +				return -EINVAL; +			opts->uid = option; +			break; +		case Opt_gid: +			if (match_octal(&args[0], &option)) +				return -EINVAL; +			opts->gid = option; +			break; +		case Opt_mode: +			if (match_octal(&args[0], &option)) +				return -EINVAL; +			opts->mode = option & S_IALLUGO; +			break; +		/* +		 * We might like to report bad mount options here; +		 * but traditionally debugfs has ignored all mount options 
+		 */ +		} +	} + +	return 0; +} + +static int debugfs_apply_options(struct super_block *sb) +{ +	struct debugfs_fs_info *fsi = sb->s_fs_info; +	struct inode *inode = sb->s_root->d_inode; +	struct debugfs_mount_opts *opts = &fsi->mount_opts; + +	inode->i_mode &= ~S_IALLUGO; +	inode->i_mode |= opts->mode; + +	inode->i_uid = opts->uid; +	inode->i_gid = opts->gid; + +	return 0; +} + +static int debugfs_remount(struct super_block *sb, int *flags, char *data) +{ +	int err; +	struct debugfs_fs_info *fsi = sb->s_fs_info; + +	err = debugfs_parse_options(data, &fsi->mount_opts); +	if (err) +		goto fail; + +	debugfs_apply_options(sb); + +fail: +	return err; +} + +static int debugfs_show_options(struct seq_file *m, struct dentry *root) +{ +	struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; +	struct debugfs_mount_opts *opts = &fsi->mount_opts; + +	if (opts->uid != 0) +		seq_printf(m, ",uid=%u", opts->uid); +	if (opts->gid != 0) +		seq_printf(m, ",gid=%u", opts->gid); +	if (opts->mode != DEBUGFS_DEFAULT_MODE) +		seq_printf(m, ",mode=%o", opts->mode); + +	return 0; +} + +static const struct super_operations debugfs_super_operations = { +	.statfs		= simple_statfs, +	.remount_fs	= debugfs_remount, +	.show_options	= debugfs_show_options, +}; +  static int debug_fill_super(struct super_block *sb, void *data, int silent)  {  	static struct tree_descr debug_files[] = {{""}}; +	struct debugfs_fs_info *fsi; +	int err; + +	save_mount_options(sb, data); + +	fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); +	sb->s_fs_info = fsi; +	if (!fsi) { +		err = -ENOMEM; +		goto fail; +	} + +	err = debugfs_parse_options(data, &fsi->mount_opts); +	if (err) +		goto fail; + +	err  =  simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); +	if (err) +		goto fail; + +	sb->s_op = &debugfs_super_operations; + +	debugfs_apply_options(sb); + +	return 0; -	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); +fail: +	kfree(fsi); +	sb->s_fs_info = NULL; +	return err;  }  static struct dentry 
*debug_mount(struct file_system_type *fs_type, diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c4e2a58a2e8..10f5e0b484d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -36,7 +36,61 @@  #define DEVPTS_DEFAULT_PTMX_MODE 0000  #define PTMX_MINOR	2 -extern int pty_limit;			/* Config limit on Unix98 ptys */ +/* + * sysctl support for setting limits on the number of Unix98 ptys allocated. + * Otherwise one can eat up all kernel memory by opening /dev/ptmx repeatedly. + */ +static int pty_limit = NR_UNIX98_PTY_DEFAULT; +static int pty_reserve = NR_UNIX98_PTY_RESERVE; +static int pty_limit_min; +static int pty_limit_max = INT_MAX; +static int pty_count; + +static struct ctl_table pty_table[] = { +	{ +		.procname	= "max", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.data		= &pty_limit, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &pty_limit_min, +		.extra2		= &pty_limit_max, +	}, { +		.procname	= "reserve", +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.data		= &pty_reserve, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &pty_limit_min, +		.extra2		= &pty_limit_max, +	}, { +		.procname	= "nr", +		.maxlen		= sizeof(int), +		.mode		= 0444, +		.data		= &pty_count, +		.proc_handler	= proc_dointvec, +	}, +	{} +}; + +static struct ctl_table pty_kern_table[] = { +	{ +		.procname	= "pty", +		.mode		= 0555, +		.child		= pty_table, +	}, +	{} +}; + +static struct ctl_table pty_root_table[] = { +	{ +		.procname	= "kernel", +		.mode		= 0555, +		.child		= pty_kern_table, +	}, +	{} +}; +  static DEFINE_MUTEX(allocated_ptys_lock);  static struct vfsmount *devpts_mnt; @@ -49,10 +103,11 @@ struct pts_mount_opts {  	umode_t mode;  	umode_t ptmxmode;  	int newinstance; +	int max;  };  enum { -	Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance, +	Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,  Opt_max,  	Opt_err  }; @@ -63,6 +118,7 @@ static const match_table_t tokens = {  #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES  	{Opt_ptmxmode, 
"ptmxmode=%o"},  	{Opt_newinstance, "newinstance"}, +	{Opt_max, "max=%d"},  #endif  	{Opt_err, NULL}  }; @@ -109,6 +165,7 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)  	opts->gid     = 0;  	opts->mode    = DEVPTS_DEFAULT_MODE;  	opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; +	opts->max     = NR_UNIX98_PTY_MAX;  	/* newinstance makes sense only on initial mount */  	if (op == PARSE_MOUNT) @@ -152,6 +209,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)  			if (op == PARSE_MOUNT)  				opts->newinstance = 1;  			break; +		case Opt_max: +			if (match_int(&args[0], &option) || +			    option < 0 || option > NR_UNIX98_PTY_MAX) +				return -EINVAL; +			opts->max = option; +			break;  #endif  		default:  			printk(KERN_ERR "devpts: called with bogus options\n"); @@ -258,6 +321,8 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)  	seq_printf(seq, ",mode=%03o", opts->mode);  #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES  	seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); +	if (opts->max < NR_UNIX98_PTY_MAX) +		seq_printf(seq, ",max=%d", opts->max);  #endif  	return 0; @@ -309,12 +374,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent)  	inode->i_fop = &simple_dir_operations;  	set_nlink(inode, 2); -	s->s_root = d_alloc_root(inode); +	s->s_root = d_make_root(inode);  	if (s->s_root)  		return 0;  	printk(KERN_ERR "devpts: get root dentry failed\n"); -	iput(inode);  fail:  	return -ENOMEM; @@ -438,6 +502,12 @@ retry:  		return -ENOMEM;  	mutex_lock(&allocated_ptys_lock); +	if (pty_count >= pty_limit - +			(fsi->mount_opts.newinstance ? 
pty_reserve : 0)) { +		mutex_unlock(&allocated_ptys_lock); +		return -ENOSPC; +	} +  	ida_ret = ida_get_new(&fsi->allocated_ptys, &index);  	if (ida_ret < 0) {  		mutex_unlock(&allocated_ptys_lock); @@ -446,11 +516,12 @@ retry:  		return -EIO;  	} -	if (index >= pty_limit) { +	if (index >= fsi->mount_opts.max) {  		ida_remove(&fsi->allocated_ptys, index);  		mutex_unlock(&allocated_ptys_lock); -		return -EIO; +		return -ENOSPC;  	} +	pty_count++;  	mutex_unlock(&allocated_ptys_lock);  	return index;  } @@ -462,6 +533,7 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)  	mutex_lock(&allocated_ptys_lock);  	ida_remove(&fsi->allocated_ptys, idx); +	pty_count--;  	mutex_unlock(&allocated_ptys_lock);  } @@ -558,11 +630,15 @@ void devpts_pty_kill(struct tty_struct *tty)  static int __init init_devpts_fs(void)  {  	int err = register_filesystem(&devpts_fs_type); +	struct ctl_table_header *table; +  	if (!err) { +		table = register_sysctl_table(pty_root_table);  		devpts_mnt = kern_mount(&devpts_fs_type);  		if (IS_ERR(devpts_mnt)) {  			err = PTR_ERR(devpts_mnt);  			unregister_filesystem(&devpts_fs_type); +			unregister_sysctl_table(table);  		}  	}  	return err; diff --git a/fs/direct-io.c b/fs/direct-io.c index 4a588dbd11b..f4aadd15b61 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -173,7 +173,7 @@ void inode_dio_wait(struct inode *inode)  	if (atomic_read(&inode->i_dio_count))  		__inode_dio_wait(inode);  } -EXPORT_SYMBOL_GPL(inode_dio_wait); +EXPORT_SYMBOL(inode_dio_wait);  /*   * inode_dio_done - signal finish of a direct I/O requests @@ -187,7 +187,7 @@ void inode_dio_done(struct inode *inode)  	if (atomic_dec_and_test(&inode->i_dio_count))  		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);  } -EXPORT_SYMBOL_GPL(inode_dio_done); +EXPORT_SYMBOL(inode_dio_done);  /*   * How many pages are in the queue? 
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 83641574b01..dc5eb598b81 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -351,11 +351,28 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,  static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)  {  	struct dlm_rsb *r; +	uint32_t hash, bucket; +	int rv; + +	hash = jhash(name, len, 0); +	bucket = hash & (ls->ls_rsbtbl_size - 1); + +	spin_lock(&ls->ls_rsbtbl[bucket].lock); +	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].keep, name, len, 0, &r); +	if (rv) +		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[bucket].toss, +					 name, len, 0, &r); +	spin_unlock(&ls->ls_rsbtbl[bucket].lock); + +	if (!rv) +		return r;  	down_read(&ls->ls_root_sem);  	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {  		if (len == r->res_length && !memcmp(name, r->res_name, len)) {  			up_read(&ls->ls_root_sem); +			log_error(ls, "find_rsb_root revert to root_list %s", +				  r->res_name);  			return r;  		}  	} diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index d47183043c5..fa5c07d51dc 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -411,8 +411,8 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)  	return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);  } -static int search_rsb_tree(struct rb_root *tree, char *name, int len, -			   unsigned int flags, struct dlm_rsb **r_ret) +int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +			unsigned int flags, struct dlm_rsb **r_ret)  {  	struct rb_node *node = tree->rb_node;  	struct dlm_rsb *r; @@ -474,12 +474,12 @@ static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,  	struct dlm_rsb *r;  	int error; -	error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r); +	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r);  	if (!error) {  		kref_get(&r->res_ref);  		goto out;  	} -	error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); +	error = 
dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);  	if (error)  		goto out; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 265017a7c3e..1a255307f6f 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -28,6 +28,9 @@ void dlm_scan_waiters(struct dlm_ls *ls);  void dlm_scan_timeout(struct dlm_ls *ls);  void dlm_adjust_timeouts(struct dlm_ls *ls); +int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len, +			unsigned int flags, struct dlm_rsb **r_ret); +  int dlm_purge_locks(struct dlm_ls *ls);  void dlm_purge_mstcpy_locks(struct dlm_rsb *r);  void dlm_grant_after_purge(struct dlm_ls *ls); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0b3109ee425..133ef6dc7cb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -52,6 +52,7 @@  #include <linux/mutex.h>  #include <linux/sctp.h>  #include <linux/slab.h> +#include <net/sctp/sctp.h>  #include <net/sctp/user.h>  #include <net/ipv6.h> @@ -474,9 +475,6 @@ static void process_sctp_notification(struct connection *con,  			int prim_len, ret;  			int addr_len;  			struct connection *new_con; -			sctp_peeloff_arg_t parg; -			int parglen = sizeof(parg); -			int err;  			/*  			 * We get this before any data for an association. 
@@ -525,23 +523,19 @@ static void process_sctp_notification(struct connection *con,  				return;  			/* Peel off a new sock */ -			parg.associd = sn->sn_assoc_change.sac_assoc_id; -			ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, -						SCTP_SOCKOPT_PEELOFF, -						(void *)&parg, &parglen); +			sctp_lock_sock(con->sock->sk); +			ret = sctp_do_peeloff(con->sock->sk, +				sn->sn_assoc_change.sac_assoc_id, +				&new_con->sock); +			sctp_release_sock(con->sock->sk);  			if (ret < 0) {  				log_print("Can't peel off a socket for "  					  "connection %d to node %d: err=%d", -					  parg.associd, nodeid, ret); -				return; -			} -			new_con->sock = sockfd_lookup(parg.sd, &err); -			if (!new_con->sock) { -				log_print("sockfd_lookup error %d", err); +					  (int)sn->sn_assoc_change.sac_assoc_id, +					  nodeid, ret);  				return;  			}  			add_sock(new_con->sock, new_con); -			sockfd_put(new_con->sock);  			log_print("connecting to %d sctp association %d",  				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); @@ -1082,7 +1076,7 @@ static void init_local(void)  	int i;  	dlm_local_count = 0; -	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) { +	for (i = 0; i < DLM_MAX_ADDR_COUNT; i++) {  		if (dlm_our_addr(&sas, i))  			break; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 2a834255c75..ea993128155 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,  			(unsigned long long)(extent_base + extent_offset), rc);  		goto out;  	} -	if (unlikely(ecryptfs_verbosity > 0)) { -		ecryptfs_printk(KERN_DEBUG, "Encrypting extent " -				"with iv:\n"); -		ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); -		ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " -				"encryption:\n"); -		ecryptfs_dump_hex((char *) -				  (page_address(page) -				   + (extent_offset * crypt_stat->extent_size)), -				  8); -	}  	rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,  					 
 page, (extent_offset  						 * crypt_stat->extent_size), @@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,  		goto out;  	}  	rc = 0; -	if (unlikely(ecryptfs_verbosity > 0)) { -		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; " -			"rc = [%d]\n", -			(unsigned long long)(extent_base + extent_offset), rc); -		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " -				"encryption:\n"); -		ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8); -	}  out:  	return rc;  } @@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page,  			(unsigned long long)(extent_base + extent_offset), rc);  		goto out;  	} -	if (unlikely(ecryptfs_verbosity > 0)) { -		ecryptfs_printk(KERN_DEBUG, "Decrypting extent " -				"with iv:\n"); -		ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); -		ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " -				"decryption:\n"); -		ecryptfs_dump_hex((char *) -				  (page_address(enc_extent_page) -				   + (extent_offset * crypt_stat->extent_size)), -				  8); -	}  	rc = ecryptfs_decrypt_page_offset(crypt_stat, page,  					  (extent_offset  					   * crypt_stat->extent_size), @@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page,  		goto out;  	}  	rc = 0; -	if (unlikely(ecryptfs_verbosity > 0)) { -		ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; " -			"rc = [%d]\n", -			(unsigned long long)(extent_base + extent_offset), rc); -		ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " -				"decryption:\n"); -		ecryptfs_dump_hex((char *)(page_address(page) -					   + (extent_offset -					      * crypt_stat->extent_size)), 8); -	}  out:  	return rc;  } @@ -1590,8 +1550,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,   */  int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)  { -	int rc = 0; -	char *page_virt = NULL; +	int rc; +	char *page_virt;  	struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;  	struct ecryptfs_crypt_stat 
*crypt_stat =  	    &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; @@ -1616,11 +1576,13 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)  						ecryptfs_dentry,  						ECRYPTFS_VALIDATE_HEADER_SIZE);  	if (rc) { +		/* metadata is not in the file header, so try xattrs */  		memset(page_virt, 0, PAGE_CACHE_SIZE);  		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);  		if (rc) {  			printk(KERN_DEBUG "Valid eCryptfs headers not found in " -			       "file header region or xattr region\n"); +			       "file header region or xattr region, inode %lu\n", +				ecryptfs_inode->i_ino);  			rc = -EINVAL;  			goto out;  		} @@ -1629,7 +1591,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)  						ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);  		if (rc) {  			printk(KERN_DEBUG "Valid eCryptfs headers not found in " -			       "file xattr region either\n"); +			       "file xattr region either, inode %lu\n", +				ecryptfs_inode->i_ino);  			rc = -EINVAL;  		}  		if (crypt_stat->mount_crypt_stat->flags @@ -1640,7 +1603,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)  			       "crypto metadata only in the extended attribute "  			       "region, but eCryptfs was mounted without "  			       "xattr support enabled. eCryptfs will not treat " -			       "this like an encrypted file.\n"); +			       "this like an encrypted file, inode %lu\n", +				ecryptfs_inode->i_ino);  			rc = -EINVAL;  		}  	} @@ -2026,6 +1990,17 @@ out:  	return;  } +static size_t ecryptfs_max_decoded_size(size_t encoded_size) +{ +	/* Not exact; conservatively long. Every block of 4 +	 * encoded characters decodes into a block of 3 +	 * decoded characters. This segment of code provides +	 * the caller with the maximum amount of allocated +	 * space that @dst will need to point to in a +	 * subsequent call. */ +	return ((encoded_size + 1) * 3) / 4; +} +  /**   * ecryptfs_decode_from_filename   * @dst: If NULL, this function only sets @dst_size and returns. 
If @@ -2044,13 +2019,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,  	size_t dst_byte_offset = 0;  	if (dst == NULL) { -		/* Not exact; conservatively long. Every block of 4 -		 * encoded characters decodes into a block of 3 -		 * decoded characters. This segment of code provides -		 * the caller with the maximum amount of allocated -		 * space that @dst will need to point to in a -		 * subsequent call. */ -		(*dst_size) = (((src_size + 1) * 3) / 4); +		(*dst_size) = ecryptfs_max_decoded_size(src_size);  		goto out;  	}  	while (src_byte_offset < src_size) { @@ -2275,3 +2244,52 @@ out_free:  out:  	return rc;  } + +#define ENC_NAME_MAX_BLOCKLEN_8_OR_16	143 + +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, +			   struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ +	struct blkcipher_desc desc; +	struct mutex *tfm_mutex; +	size_t cipher_blocksize; +	int rc; + +	if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { +		(*namelen) = lower_namelen; +		return 0; +	} + +	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, +			mount_crypt_stat->global_default_fn_cipher_name); +	if (unlikely(rc)) { +		(*namelen) = 0; +		return rc; +	} + +	mutex_lock(tfm_mutex); +	cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm); +	mutex_unlock(tfm_mutex); + +	/* Return an exact amount for the common cases */ +	if (lower_namelen == NAME_MAX +	    && (cipher_blocksize == 8 || cipher_blocksize == 16)) { +		(*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16; +		return 0; +	} + +	/* Return a safe estimate for the uncommon cases */ +	(*namelen) = lower_namelen; +	(*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; +	/* Since this is the max decoded size, subtract 1 "decoded block" len */ +	(*namelen) = ecryptfs_max_decoded_size(*namelen) - 3; +	(*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE; +	(*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES; +	/* Worst case is that the filename is padded nearly a full 
block size */ +	(*namelen) -= cipher_blocksize - 1; + +	if ((*namelen) < 0) +		(*namelen) = 0; + +	return 0; +} diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index a9f29b12fbf..867b64c5d84 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -151,12 +151,21 @@ ecryptfs_get_key_payload_data(struct key *key)  					  * dentry name */  #define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as  					  * metadata */ +#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */ +#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to +				     * ecryptfs_parse_packet_length() and +				     * ecryptfs_write_packet_length() +				     */  /* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=   * ECRYPTFS_MAX_IV_BYTES */  #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16  #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */  #define MD5_DIGEST_SIZE 16  #define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ +					   + ECRYPTFS_SIG_SIZE + 1 + 1) +#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ +					   + ECRYPTFS_SIG_SIZE + 1 + 1)  #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."  #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23  #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." 
@@ -696,6 +705,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,  			     size_t *packet_size,  			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,  			     char *data, size_t max_packet_size); +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, +			   struct ecryptfs_mount_crypt_stat *mount_crypt_stat);  int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,  		       loff_t offset); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index d3f95f941c4..2b17f2f9b12 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -48,8 +48,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,  				unsigned long nr_segs, loff_t pos)  {  	ssize_t rc; -	struct dentry *lower_dentry; -	struct vfsmount *lower_vfsmount; +	struct path lower;  	struct file *file = iocb->ki_filp;  	rc = generic_file_aio_read(iocb, iov, nr_segs, pos); @@ -60,9 +59,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,  	if (-EIOCBQUEUED == rc)  		rc = wait_on_sync_kiocb(iocb);  	if (rc >= 0) { -		lower_dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); -		lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); -		touch_atime(lower_vfsmount, lower_dentry); +		lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); +		lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); +		touch_atime(&lower);  	}  	return rc;  } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 19a8ca4ab1d..ab35b113003 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -822,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,  		size_t num_zeros = (PAGE_CACHE_SIZE  				    - (ia->ia_size & ~PAGE_CACHE_MASK)); - -		/* -		 * XXX(truncate) this should really happen at the begginning -		 * of ->setattr.  But the code is too messy to that as part -		 * of a larger patch.  
ecryptfs is also totally missing out -		 * on the inode_change_ok check at the beginning of -		 * ->setattr while would include this. -		 */ -		rc = inode_newsize_ok(inode, ia->ia_size); -		if (rc) -			goto out; -  		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {  			truncate_setsize(inode, ia->ia_size);  			lower_ia->ia_size = ia->ia_size; @@ -883,6 +871,28 @@ out:  	return rc;  } +static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset) +{ +	struct ecryptfs_crypt_stat *crypt_stat; +	loff_t lower_oldsize, lower_newsize; + +	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; +	lower_oldsize = upper_size_to_lower_size(crypt_stat, +						 i_size_read(inode)); +	lower_newsize = upper_size_to_lower_size(crypt_stat, offset); +	if (lower_newsize > lower_oldsize) { +		/* +		 * The eCryptfs inode and the new *lower* size are mixed here +		 * because we may not have the lower i_mutex held and/or it may +		 * not be appropriate to call inode_newsize_ok() with inodes +		 * from other filesystems. 
+		 */ +		return inode_newsize_ok(inode, lower_newsize); +	} + +	return 0; +} +  /**   * ecryptfs_truncate   * @dentry: The ecryptfs layer dentry @@ -899,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)  	struct iattr lower_ia = { .ia_valid = 0 };  	int rc; +	rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); +	if (rc) +		return rc; +  	rc = truncate_upper(dentry, &ia, &lower_ia);  	if (!rc && lower_ia.ia_valid & ATTR_SIZE) {  		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); @@ -978,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)  		}  	}  	mutex_unlock(&crypt_stat->cs_mutex); + +	rc = inode_change_ok(inode, ia); +	if (rc) +		goto out; +	if (ia->ia_valid & ATTR_SIZE) { +		rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size); +		if (rc) +			goto out; +	} +  	if (S_ISREG(inode->i_mode)) {  		rc = filemap_write_and_wait(inode->i_mapping);  		if (rc) @@ -1061,6 +1085,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,  	}  	rc = vfs_setxattr(lower_dentry, name, value, size, flags); +	if (!rc) +		fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);  out:  	return rc;  } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index ac1ad48c237..2333203a120 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,  		(*size) += ((unsigned char)(data[1]) + 192);  		(*length_size) = 2;  	} else if (data[0] == 255) { -		/* Five-byte length; we're not supposed to see this */ +		/* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */  		ecryptfs_printk(KERN_ERR, "Five-byte packet length not "  				"supported\n");  		rc = -EINVAL; @@ -126,7 +126,7 @@ out:  /**   * ecryptfs_write_packet_length   * @dest: The byte array target into which to write the length. Must - *        have at least 5 bytes allocated. 
+ *        have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.   * @size: The length to write.   * @packet_size_length: The number of bytes used to encode the packet   *                      length is written to this address. @@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,  		dest[1] = ((size - 192) % 256);  		(*packet_size_length) = 2;  	} else { +		/* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */  		rc = -EINVAL;  		ecryptfs_printk(KERN_WARNING,  				"Unsupported packet size: [%zd]\n", size); @@ -678,10 +679,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,  	 * Octets N3-N4: Block-aligned encrypted filename  	 *  - Consists of a minimum number of random characters, a \0  	 *    separator, and then the filename */ -	s->max_packet_size = (1                   /* Tag 70 identifier */ -			      + 3                 /* Max Tag 70 packet size */ -			      + ECRYPTFS_SIG_SIZE /* FNEK sig */ -			      + 1                 /* Cipher identifier */ +	s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE  			      + s->block_aligned_filename_size);  	if (dest == NULL) {  		(*packet_size) = s->max_packet_size; @@ -933,10 +931,10 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,  		goto out;  	}  	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; -	if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { +	if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {  		printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "  		       "at least [%d]\n", __func__, max_packet_size, -			(1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); +		       ECRYPTFS_TAG_70_MIN_METADATA_SIZE);  		rc = -EINVAL;  		goto out;  	} diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index b4a6befb121..68954937a07 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -550,9 +550,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags  	if (IS_ERR(inode))  		goto out_free; -	s->s_root = 
d_alloc_root(inode); +	s->s_root = d_make_root(inode);  	if (!s->s_root) { -		iput(inode);  		rc = -ENOMEM;  		goto out_free;  	} @@ -795,15 +794,10 @@ static int __init ecryptfs_init(void)  		       "Failed to allocate one or more kmem_cache objects\n");  		goto out;  	} -	rc = register_filesystem(&ecryptfs_fs_type); -	if (rc) { -		printk(KERN_ERR "Failed to register filesystem\n"); -		goto out_free_kmem_caches; -	}  	rc = do_sysfs_registration();  	if (rc) {  		printk(KERN_ERR "sysfs registration failed\n"); -		goto out_unregister_filesystem; +		goto out_free_kmem_caches;  	}  	rc = ecryptfs_init_kthread();  	if (rc) { @@ -824,19 +818,24 @@ static int __init ecryptfs_init(void)  		       "rc = [%d]\n", rc);  		goto out_release_messaging;  	} +	rc = register_filesystem(&ecryptfs_fs_type); +	if (rc) { +		printk(KERN_ERR "Failed to register filesystem\n"); +		goto out_destroy_crypto; +	}  	if (ecryptfs_verbosity > 0)  		printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values "  			"will be written to the syslog!\n", ecryptfs_verbosity);  	goto out; +out_destroy_crypto: +	ecryptfs_destroy_crypto();  out_release_messaging:  	ecryptfs_release_messaging();  out_destroy_kthread:  	ecryptfs_destroy_kthread();  out_do_sysfs_unregistration:  	do_sysfs_unregistration(); -out_unregister_filesystem: -	unregister_filesystem(&ecryptfs_fs_type);  out_free_kmem_caches:  	ecryptfs_free_kmem_caches();  out: diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 940a82e63dc..3a06f4043df 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -218,6 +218,29 @@ out_unlock:  	return rc;  } +/* + * miscdevfs packet format: + *  Octet 0: Type + *  Octets 1-4: network byte order msg_ctx->counter + *  Octets 5-N0: Size of struct ecryptfs_message to follow + *  Octets N0-N1: struct ecryptfs_message (including data) + * + *  Octets 5-N1 not written if the packet type does not include a message + */ +#define PKT_TYPE_SIZE		1 +#define PKT_CTR_SIZE		4 +#define 
MIN_NON_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE) +#define MIN_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE \ +				 + ECRYPTFS_MIN_PKT_LEN_SIZE) +/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */ +#define MAX_MSG_PKT_SIZE	(PKT_TYPE_SIZE + PKT_CTR_SIZE \ +				 + ECRYPTFS_MAX_PKT_LEN_SIZE \ +				 + sizeof(struct ecryptfs_message) \ +				 + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) +#define PKT_TYPE_OFFSET		0 +#define PKT_CTR_OFFSET		PKT_TYPE_SIZE +#define PKT_LEN_OFFSET		(PKT_TYPE_SIZE + PKT_CTR_SIZE) +  /**   * ecryptfs_miscdev_read - format and send message from queue   * @file: fs/ecryptfs/euid miscdevfs handle (ignored) @@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,  	struct ecryptfs_daemon *daemon;  	struct ecryptfs_msg_ctx *msg_ctx;  	size_t packet_length_size; -	char packet_length[3]; +	char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];  	size_t i;  	size_t total_length;  	uid_t euid = current_euid(); @@ -305,15 +328,8 @@ check_list:  		packet_length_size = 0;  		msg_ctx->msg_size = 0;  	} -	/* miscdevfs packet format: -	 *  Octet 0: Type -	 *  Octets 1-4: network byte order msg_ctx->counter -	 *  Octets 5-N0: Size of struct ecryptfs_message to follow -	 *  Octets N0-N1: struct ecryptfs_message (including data) -	 * -	 *  Octets 5-N1 not written if the packet type does not -	 *  include a message */ -	total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size); +	total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size +			+ msg_ctx->msg_size);  	if (count < total_length) {  		rc = 0;  		printk(KERN_WARNING "%s: Only given user buffer of " @@ -324,9 +340,10 @@ check_list:  	rc = -EFAULT;  	if (put_user(msg_ctx->type, buf))  		goto out_unlock_msg_ctx; -	if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1))) +	if (put_user(cpu_to_be32(msg_ctx->counter), +		     (__be32 __user *)(&buf[PKT_CTR_OFFSET])))  		goto out_unlock_msg_ctx; -	i = 5; +	i = PKT_TYPE_SIZE + 
PKT_CTR_SIZE;  	if (msg_ctx->msg) {  		if (copy_to_user(&buf[i], packet_length, packet_length_size))  			goto out_unlock_msg_ctx; @@ -391,12 +408,6 @@ out:   * @count: Amount of data in @buf   * @ppos: Pointer to offset in file (ignored)   * - * miscdevfs packet format: - *  Octet 0: Type - *  Octets 1-4: network byte order msg_ctx->counter (0's for non-response) - *  Octets 5-N0: Size of struct ecryptfs_message to follow - *  Octets N0-N1: struct ecryptfs_message (including data) - *   * Returns the number of bytes read from @buf   */  static ssize_t @@ -405,60 +416,78 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,  {  	__be32 counter_nbo;  	u32 seq; -	size_t packet_size, packet_size_length, i; -	ssize_t sz = 0; +	size_t packet_size, packet_size_length;  	char *data;  	uid_t euid = current_euid(); -	int rc; +	unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE]; +	ssize_t rc; -	if (count == 0) -		goto out; +	if (count == 0) { +		return 0; +	} else if (count == MIN_NON_MSG_PKT_SIZE) { +		/* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */ +		goto memdup; +	} else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) { +		printk(KERN_WARNING "%s: Acceptable packet size range is " +		       "[%d-%zu], but amount of data written is [%zu].", +		       __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count); +		return -EINVAL; +	} + +	if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET], +			   sizeof(packet_size_peek))) { +		printk(KERN_WARNING "%s: Error while inspecting packet size\n", +		       __func__); +		return -EFAULT; +	} +	rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size, +					  &packet_size_length); +	if (rc) { +		printk(KERN_WARNING "%s: Error parsing packet length; " +		       "rc = [%zd]\n", __func__, rc); +		return rc; +	} + +	if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size) +	    != count) { +		printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__, +		       
packet_size); +		return -EINVAL; +	} + +memdup:  	data = memdup_user(buf, count);  	if (IS_ERR(data)) {  		printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",  		       __func__, PTR_ERR(data)); -		goto out; +		return PTR_ERR(data);  	} -	sz = count; -	i = 0; -	switch (data[i++]) { +	switch (data[PKT_TYPE_OFFSET]) {  	case ECRYPTFS_MSG_RESPONSE: -		if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { +		if (count < (MIN_MSG_PKT_SIZE +			     + sizeof(struct ecryptfs_message))) {  			printk(KERN_WARNING "%s: Minimum acceptable packet "  			       "size is [%zd], but amount of data written is "  			       "only [%zd]. Discarding response packet.\n",  			       __func__, -			       (1 + 4 + 1 + sizeof(struct ecryptfs_message)), -			       count); +			       (MIN_MSG_PKT_SIZE +				+ sizeof(struct ecryptfs_message)), count); +			rc = -EINVAL;  			goto out_free;  		} -		memcpy(&counter_nbo, &data[i], 4); +		memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);  		seq = be32_to_cpu(counter_nbo); -		i += 4; -		rc = ecryptfs_parse_packet_length(&data[i], &packet_size, -						  &packet_size_length); +		rc = ecryptfs_miscdev_response( +				&data[PKT_LEN_OFFSET + packet_size_length], +				packet_size, euid, current_user_ns(), +				task_pid(current), seq);  		if (rc) { -			printk(KERN_WARNING "%s: Error parsing packet length; " -			       "rc = [%d]\n", __func__, rc); -			goto out_free; -		} -		i += packet_size_length; -		if ((1 + 4 + packet_size_length + packet_size) != count) { -			printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" -			       " + packet_size([%zd]))([%zd]) != " -			       "count([%zd]). 
Invalid packet format.\n", -			       __func__, packet_size_length, packet_size, -			       (1 + packet_size_length + packet_size), count); -			goto out_free; -		} -		rc = ecryptfs_miscdev_response(&data[i], packet_size, -					       euid, current_user_ns(), -					       task_pid(current), seq); -		if (rc)  			printk(KERN_WARNING "%s: Failed to deliver miscdev " -			       "response to requesting operation; rc = [%d]\n", +			       "response to requesting operation; rc = [%zd]\n",  			       __func__, rc); +			goto out_free; +		}  		break;  	case ECRYPTFS_MSG_HELO:  	case ECRYPTFS_MSG_QUIT: @@ -467,12 +496,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,  		ecryptfs_printk(KERN_WARNING, "Dropping miscdev "  				"message of unrecognized type [%d]\n",  				data[0]); -		break; +		rc = -EINVAL; +		goto out_free;  	} +	rc = count;  out_free:  	kfree(data); -out: -	return sz; +	return rc;  } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 6a44148c5fb..a46b3a8fee1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)   * @page: Page that is locked before this call is made   *   * Returns zero on success; non-zero otherwise + * + * This is where we encrypt the data and pass the encrypted data to + * the lower filesystem.  In OpenPGP-compatible mode, we operate on + * entire underlying packets.   
*/  static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)  { @@ -146,7 +150,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,  			/* This is a header extent */  			char *page_virt; -			page_virt = kmap_atomic(page, KM_USER0); +			page_virt = kmap_atomic(page);  			memset(page_virt, 0, PAGE_CACHE_SIZE);  			/* TODO: Support more than one header extent */  			if (view_extent_num == 0) { @@ -159,7 +163,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,  							       crypt_stat,  							       &written);  			} -			kunmap_atomic(page_virt, KM_USER0); +			kunmap_atomic(page_virt);  			flush_dcache_page(page);  			if (rc) {  				printk(KERN_ERR "%s: Error reading xattr " @@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)   * @copied: The amount of data copied   * @page: The eCryptfs page   * @fsdata: The fsdata (unused) - * - * This is where we encrypt the data and pass the encrypted data to - * the lower filesystem.  In OpenPGP-compatible mode, we operate on - * entire underlying packets.   
*/  static int ecryptfs_write_end(struct file *file,  			struct address_space *mapping, diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 3745f7c2b9c..b2a34a192f4 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,  		pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);  		size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);  		size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); -		size_t total_remaining_bytes = ((offset + size) - pos); +		loff_t total_remaining_bytes = ((offset + size) - pos); + +		if (fatal_signal_pending(current)) { +			rc = -EINTR; +			break; +		}  		if (num_bytes > total_remaining_bytes)  			num_bytes = total_remaining_bytes;  		if (pos < offset) {  			/* remaining zeros to write, up to destination offset */ -			size_t total_remaining_zeros = (offset - pos); +			loff_t total_remaining_zeros = (offset - pos);  			if (num_bytes > total_remaining_zeros)  				num_bytes = total_remaining_zeros; @@ -151,7 +156,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,  			       ecryptfs_page_idx, rc);  			goto out;  		} -		ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); +		ecryptfs_page_virt = kmap_atomic(ecryptfs_page);  		/*  		 * pos: where we're now writing, offset: where the request was @@ -174,7 +179,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,  			       (data + data_offset), num_bytes);  			data_offset += num_bytes;  		} -		kunmap_atomic(ecryptfs_page_virt, KM_USER0); +		kunmap_atomic(ecryptfs_page_virt);  		flush_dcache_page(ecryptfs_page);  		SetPageUptodate(ecryptfs_page);  		unlock_page(ecryptfs_page); @@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,  		}  		pos += num_bytes;  	} -	if ((offset + size) > ecryptfs_file_size) { -		i_size_write(ecryptfs_inode, (offset + size)); 
+	if (pos > ecryptfs_file_size) { +		i_size_write(ecryptfs_inode, pos);  		if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { -			rc = ecryptfs_write_inode_size_to_metadata( +			int rc2; + +			rc2 = ecryptfs_write_inode_size_to_metadata(  								ecryptfs_inode); -			if (rc) { +			if (rc2) {  				printk(KERN_ERR	"Problem with "  				       "ecryptfs_write_inode_size_to_metadata; " -				       "rc = [%d]\n", rc); +				       "rc = [%d]\n", rc2); +				if (!rc) +					rc = rc2;  				goto out;  			}  		} @@ -273,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,  	flush_dcache_page(page_for_ecryptfs);  	return rc;  } - -#if 0 -/** - * ecryptfs_read - * @data: The virtual address into which to write the data read (and - *        possibly decrypted) from the lower file - * @offset: The offset in the decrypted view of the file from which to - *          read into @data - * @size: The number of bytes to read into @data - * @ecryptfs_file: The eCryptfs file from which to read - * - * Read an arbitrary amount of data from an arbitrary location in the - * eCryptfs page cache. This is done on an extent-by-extent basis; - * individual extents are decrypted and read from the lower page - * cache (via VFS reads). This function takes care of all the - * address translation to locations in the lower filesystem. 
- * - * Returns zero on success; non-zero otherwise - */ -int ecryptfs_read(char *data, loff_t offset, size_t size, -		  struct file *ecryptfs_file) -{ -	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; -	struct page *ecryptfs_page; -	char *ecryptfs_page_virt; -	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); -	loff_t data_offset = 0; -	loff_t pos; -	int rc = 0; - -	if ((offset + size) > ecryptfs_file_size) { -		rc = -EINVAL; -		printk(KERN_ERR "%s: Attempt to read data past the end of the " -			"file; offset = [%lld]; size = [%td]; " -		       "ecryptfs_file_size = [%lld]\n", -		       __func__, offset, size, ecryptfs_file_size); -		goto out; -	} -	pos = offset; -	while (pos < (offset + size)) { -		pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); -		size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); -		size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); -		size_t total_remaining_bytes = ((offset + size) - pos); - -		if (num_bytes > total_remaining_bytes) -			num_bytes = total_remaining_bytes; -		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, -							 ecryptfs_page_idx); -		if (IS_ERR(ecryptfs_page)) { -			rc = PTR_ERR(ecryptfs_page); -			printk(KERN_ERR "%s: Error getting page at " -			       "index [%ld] from eCryptfs inode " -			       "mapping; rc = [%d]\n", __func__, -			       ecryptfs_page_idx, rc); -			goto out; -		} -		ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); -		memcpy((data + data_offset), -		       ((char *)ecryptfs_page_virt + start_offset_in_page), -		       num_bytes); -		kunmap_atomic(ecryptfs_page_virt, KM_USER0); -		flush_dcache_page(ecryptfs_page); -		SetPageUptodate(ecryptfs_page); -		unlock_page(ecryptfs_page); -		page_cache_release(ecryptfs_page); -		pos += num_bytes; -		data_offset += num_bytes; -	} -out: -	return rc; -} -#endif  /*  0  */ diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 9df7fd6e0c3..2dd946b636d 100644 --- a/fs/ecryptfs/super.c +++ 
b/fs/ecryptfs/super.c @@ -30,6 +30,8 @@  #include <linux/seq_file.h>  #include <linux/file.h>  #include <linux/crypto.h> +#include <linux/statfs.h> +#include <linux/magic.h>  #include "ecryptfs_kernel.h"  struct kmem_cache *ecryptfs_inode_info_cache; @@ -102,10 +104,20 @@ static void ecryptfs_destroy_inode(struct inode *inode)  static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)  {  	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); +	int rc;  	if (!lower_dentry->d_sb->s_op->statfs)  		return -ENOSYS; -	return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); + +	rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); +	if (rc) +		return rc; + +	buf->f_type = ECRYPTFS_SUPER_MAGIC; +	rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen, +	       &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat); + +	return rc;  }  /** @@ -172,7 +184,6 @@ static int ecryptfs_show_options(struct seq_file *m, struct dentry *root)  const struct super_operations ecryptfs_sops = {  	.alloc_inode = ecryptfs_alloc_inode,  	.destroy_inode = ecryptfs_destroy_inode, -	.drop_inode = generic_drop_inode,  	.statfs = ecryptfs_statfs,  	.remount_fs = NULL,  	.evict_inode = ecryptfs_evict_inode, diff --git a/fs/efs/super.c b/fs/efs/super.c index 981106429a9..e755ec746c6 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -317,10 +317,9 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)  		goto out_no_fs;  	} -	s->s_root = d_alloc_root(root); +	s->s_root = d_make_root(root);  	if (!(s->s_root)) {  		printk(KERN_ERR "EFS: get root dentry failed\n"); -		iput(root);  		ret = -ENOMEM;  		goto out_no_fs;  	} diff --git a/fs/eventfd.c b/fs/eventfd.c index d9a59177391..dba15fecf23 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -16,7 +16,7 @@  #include <linux/spinlock.h>  #include <linux/anon_inodes.h>  #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/kref.h>  #include 
<linux/eventfd.h> diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aabdfc38cf2..629e9ed99d0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -320,6 +320,11 @@ static inline int ep_is_linked(struct list_head *p)  	return !list_empty(p);  } +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ +	return container_of(p, struct eppoll_entry, wait); +} +  /* Get the "struct epitem" from a wait queue pointer */  static inline struct epitem *ep_item_from_wait(wait_queue_t *p)  { @@ -422,6 +427,31 @@ out_unlock:  	return error;  } +/* + * As described in commit 0ccf831cb lockdep: annotate epoll + * the use of wait queues used by epoll is done in a very controlled + * manner. Wake ups can nest inside each other, but are never done + * with the same locking. For example: + * + *   dfd = socket(...); + *   efd1 = epoll_create(); + *   efd2 = epoll_create(); + *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); + *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); + * + * When a packet arrives to the device underneath "dfd", the net code will + * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a + * callback wakeup entry on that queue, and the wake_up() performed by the + * "dfd" net code will end up in ep_poll_callback(). At this point epoll + * (efd1) notices that it may have some event ready, so it needs to wake up + * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() + * that ends up in another wake_up(), after having checked about the + * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to + * avoid stack blasting. + * + * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle + * this special case of epoll. 
+ */  #ifdef CONFIG_DEBUG_LOCK_ALLOC  static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,  				     unsigned long events, int subclass) @@ -467,6 +497,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq)  	put_cpu();  } +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ +	wait_queue_head_t *whead; + +	rcu_read_lock(); +	/* If it is cleared by POLLFREE, it should be rcu-safe */ +	whead = rcu_dereference(pwq->whead); +	if (whead) +		remove_wait_queue(whead, &pwq->wait); +	rcu_read_unlock(); +} +  /*   * This function unregisters poll callbacks from the associated file   * descriptor.  Must be called with "mtx" held (or "epmutex" if called from @@ -481,7 +523,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)  		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);  		list_del(&pwq->llink); -		remove_wait_queue(pwq->whead, &pwq->wait); +		ep_remove_wait_queue(pwq);  		kmem_cache_free(pwq_cache, pwq);  	}  } @@ -682,9 +724,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,  			       void *priv)  {  	struct epitem *epi, *tmp; +	poll_table pt; +	init_poll_funcptr(&pt, NULL);  	list_for_each_entry_safe(epi, tmp, head, rdllink) { -		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & +		pt._key = epi->event.events; +		if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &  		    epi->event.events)  			return POLLIN | POLLRDNORM;  		else { @@ -842,6 +887,17 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k  	struct epitem *epi = ep_item_from_wait(wait);  	struct eventpoll *ep = epi->ep; +	if ((unsigned long)key & POLLFREE) { +		ep_pwq_from_wait(wait)->whead = NULL; +		/* +		 * whead = NULL above can race with ep_remove_wait_queue() +		 * which can do another remove_wait_queue() after us, so we +		 * can't use __remove_wait_queue(). whead->lock is held by +		 * the caller. 
+		 */ +		list_del_init(&wait->task_list); +	} +  	spin_lock_irqsave(&ep->lock, flags);  	/* @@ -960,6 +1016,10 @@ static int path_count[PATH_ARR_SIZE];  static int path_count_inc(int nests)  { +	/* Allow an arbitrary number of depth 1 paths */ +	if (nests == 0) +		return 0; +  	if (++path_count[nests] > path_limits[nests])  		return -1;  	return 0; @@ -1017,13 +1077,11 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)   */  static int reverse_path_check(void)  { -	int length = 0;  	int error = 0;  	struct file *current_file;  	/* let's call this for all tfiles */  	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { -		length++;  		path_count_init();  		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,  					reverse_path_check_proc, current_file, @@ -1065,6 +1123,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,  	/* Initialize the poll table using the queue callback */  	epq.epi = epi;  	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); +	epq.pt._key = event->events;  	/*  	 * Attach the item to the poll hooks and get current event bits. @@ -1159,6 +1218,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even  {  	int pwake = 0;  	unsigned int revents; +	poll_table pt; + +	init_poll_funcptr(&pt, NULL);  	/*  	 * Set the new event interest mask before calling f_op->poll(); @@ -1166,13 +1228,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even  	 * f_op->poll() call and the new event set registering.  	 */  	epi->event.events = event->events; +	pt._key = event->events;  	epi->event.data = event->data; /* protected by mtx */  	/*  	 * Get current event bits. We can safely use the file* here because  	 * its usage count has been increased by the caller of this function.  	 
*/ -	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); +	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);  	/*  	 * If the item is "hot" and it is not registered inside the ready @@ -1207,6 +1270,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,  	unsigned int revents;  	struct epitem *epi;  	struct epoll_event __user *uevent; +	poll_table pt; + +	init_poll_funcptr(&pt, NULL);  	/*  	 * We can loop without lock because we are passed a task private list. @@ -1219,7 +1285,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,  		list_del_init(&epi->rdllink); -		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & +		pt._key = epi->event.events; +		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &  			epi->event.events;  		/* diff --git a/fs/exec.c b/fs/exec.c index aeb135c7ff5..23559c227d9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -63,6 +63,8 @@  #include <trace/events/task.h>  #include "internal.h" +#include <trace/events/sched.h> +  int core_uses_pid;  char core_pattern[CORENAME_MAX_SIZE] = "core";  unsigned int core_pipe_limit; @@ -79,15 +81,13 @@ static atomic_t call_count = ATOMIC_INIT(1);  static LIST_HEAD(formats);  static DEFINE_RWLOCK(binfmt_lock); -int __register_binfmt(struct linux_binfmt * fmt, int insert) +void __register_binfmt(struct linux_binfmt * fmt, int insert)  { -	if (!fmt) -		return -EINVAL; +	BUG_ON(!fmt);  	write_lock(&binfmt_lock);  	insert ? 
list_add(&fmt->lh, &formats) :  		 list_add_tail(&fmt->lh, &formats);  	write_unlock(&binfmt_lock); -	return 0;	  }  EXPORT_SYMBOL(__register_binfmt); @@ -822,7 +822,7 @@ static int exec_mmap(struct mm_struct *mm)  	/* Notify parent that we're no longer interested in the old VM */  	tsk = current;  	old_mm = current->mm; -	sync_mm_rss(tsk, old_mm); +	sync_mm_rss(old_mm);  	mm_release(tsk, old_mm);  	if (old_mm) { @@ -848,6 +848,7 @@ static int exec_mmap(struct mm_struct *mm)  	if (old_mm) {  		up_read(&old_mm->mmap_sem);  		BUG_ON(active_mm != old_mm); +		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);  		mm_update_next_owner(old_mm);  		mmput(old_mm);  		return 0; @@ -975,8 +976,8 @@ static int de_thread(struct task_struct *tsk)  	sig->notify_count = 0;  no_thread_group: -	if (current->mm) -		setmax_mm_hiwater_rss(&sig->maxrss, current->mm); +	/* we have changed execution domain */ +	tsk->exit_signal = SIGCHLD;  	exit_itimers(sig);  	flush_itimer_signals(); @@ -1071,6 +1072,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)  	perf_event_comm(tsk);  } +static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) +{ +	int i, ch; + +	/* Copies the binary name from after last slash */ +	for (i = 0; (ch = *(fn++)) != '\0';) { +		if (ch == '/') +			i = 0; /* overwrite what we wrote */ +		else +			if (i < len - 1) +				tcomm[i++] = ch; +	} +	tcomm[i] = '\0'; +} +  int flush_old_exec(struct linux_binprm * bprm)  {  	int retval; @@ -1085,6 +1101,7 @@ int flush_old_exec(struct linux_binprm * bprm)  	set_mm_exe_file(bprm->mm, bprm->file); +	filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));  	/*  	 * Release all of the old mmap stuff  	 */ @@ -1096,7 +1113,7 @@ int flush_old_exec(struct linux_binprm * bprm)  	bprm->mm = NULL;		/* We're using it now */  	set_fs(USER_DS); -	current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); +	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD);  	flush_thread();  	
current->personality &= ~bprm->per_clear; @@ -1116,10 +1133,6 @@ EXPORT_SYMBOL(would_dump);  void setup_new_exec(struct linux_binprm * bprm)  { -	int i, ch; -	const char *name; -	char tcomm[sizeof(current->comm)]; -  	arch_pick_mmap_layout(current->mm);  	/* This is the point of no return */ @@ -1130,18 +1143,7 @@ void setup_new_exec(struct linux_binprm * bprm)  	else  		set_dumpable(current->mm, suid_dumpable); -	name = bprm->filename; - -	/* Copies the binary name from after last slash */ -	for (i=0; (ch = *(name++)) != '\0';) { -		if (ch == '/') -			i = 0; /* overwrite what we wrote */ -		else -			if (i < (sizeof(tcomm) - 1)) -				tcomm[i++] = ch; -	} -	tcomm[i] = '\0'; -	set_task_comm(current, tcomm); +	set_task_comm(current, bprm->tcomm);  	/* Set the new mm task size. We have to do that late because it may  	 * depend on TIF_32BIT which is only updated in flush_thread() on @@ -1338,13 +1340,13 @@ int remove_arg_zero(struct linux_binprm *bprm)  			ret = -EFAULT;  			goto out;  		} -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		for (; offset < PAGE_SIZE && kaddr[offset];  				offset++, bprm->p++)  			; -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		put_arg_page(page);  		if (offset == PAGE_SIZE) @@ -1401,9 +1403,10 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)  			 */  			bprm->recursion_depth = depth;  			if (retval >= 0) { -				if (depth == 0) -					ptrace_event(PTRACE_EVENT_EXEC, -							old_pid); +				if (depth == 0) { +					trace_sched_process_exec(current, old_pid, bprm); +					ptrace_event(PTRACE_EVENT_EXEC, old_pid); +				}  				put_binfmt(fmt);  				allow_write_access(bprm->file);  				if (bprm->file) @@ -1914,7 +1917,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state)  {  	struct task_struct *tsk = current;  	struct mm_struct *mm = tsk->mm; -	struct completion *vfork_done;  	int core_waiters = -EBUSY;  	init_completion(&core_state->startup); @@ -1926,22 
+1928,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)  		core_waiters = zap_threads(tsk, mm, core_state, exit_code);  	up_write(&mm->mmap_sem); -	if (unlikely(core_waiters < 0)) -		goto fail; - -	/* -	 * Make sure nobody is waiting for us to release the VM, -	 * otherwise we can deadlock when we wait on each other -	 */ -	vfork_done = tsk->vfork_done; -	if (vfork_done) { -		tsk->vfork_done = NULL; -		complete(vfork_done); -	} - -	if (core_waiters) +	if (core_waiters > 0)  		wait_for_completion(&core_state->startup); -fail: +  	return core_waiters;  } diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 80405836ba6..c61e62ac231 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -597,7 +597,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)  		goto fail;  	} -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	de = (struct exofs_dir_entry *)kaddr;  	de->name_len = 1;  	de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); @@ -611,7 +611,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)  	de->inode_no = cpu_to_le64(parent->i_ino);  	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));  	exofs_set_de_type(de, inode); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	err = exofs_commit_chunk(page, 0, chunk_size);  fail:  	page_cache_release(page); diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 9dbf0c30103..fc7161d6bf6 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -143,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,  {  	struct inode *inode = old_dentry->d_inode; -	if (inode->i_nlink >= EXOFS_LINK_MAX) -		return -EMLINK; -  	inode->i_ctime = CURRENT_TIME;  	inode_inc_link_count(inode);  	ihold(inode); @@ -156,10 +153,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir,  static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	struct inode *inode; -	int err = -EMLINK; - -	if (dir->i_nlink >= 
EXOFS_LINK_MAX) -		goto out; +	int err;  	inode_inc_link_count(dir); @@ -275,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,  		if (err)  			goto out_dir;  	} else { -		if (dir_de) { -			err = -EMLINK; -			if (new_dir->i_nlink >= EXOFS_LINK_MAX) -				goto out_dir; -		}  		err = exofs_add_link(new_dentry, old_inode);  		if (err)  			goto out_dir; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index d22cd168c6e..7f2b590a36b 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -754,6 +754,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_blocksize = EXOFS_BLKSIZE;  	sb->s_blocksize_bits = EXOFS_BLKSHIFT;  	sb->s_maxbytes = MAX_LFS_FILESIZE; +	sb->s_max_links = EXOFS_LINK_MAX;  	atomic_set(&sbi->s_curr_pending, 0);  	sb->s_bdev = NULL;  	sb->s_dev = 0; @@ -818,9 +819,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)  		ret = PTR_ERR(root);  		goto free_sbi;  	} -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) { -		iput(root);  		EXOFS_ERR("ERROR: get root inode failed\n");  		ret = -ENOMEM;  		goto free_sbi; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index d37df352d32..0f4f5c92925 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -645,7 +645,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)  		unlock_page(page);  		goto fail;  	} -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memset(kaddr, 0, chunk_size);  	de = (struct ext2_dir_entry_2 *)kaddr;  	de->name_len = 1; @@ -660,7 +660,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)  	de->inode = cpu_to_le32(parent->i_ino);  	memcpy (de->name, "..\0", 4);  	ext2_set_de_type (de, inode); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	err = ext2_commit_chunk(page, 0, chunk_size);  fail:  	page_cache_release(page); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 1089f760c84..2de655f5d62 100644 --- 
a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		flags = flags & EXT2_FL_USER_MODIFIABLE;  		flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;  		ei->i_flags = flags; -		mutex_unlock(&inode->i_mutex);  		ext2_set_inode_flags(inode);  		inode->i_ctime = CURRENT_TIME_SEC; +		mutex_unlock(&inode->i_mutex); +  		mark_inode_dirty(inode);  setflags_out:  		mnt_drop_write_file(filp); @@ -88,20 +89,29 @@ setflags_out:  	}  	case EXT2_IOC_GETVERSION:  		return put_user(inode->i_generation, (int __user *) arg); -	case EXT2_IOC_SETVERSION: +	case EXT2_IOC_SETVERSION: { +		__u32 generation; +  		if (!inode_owner_or_capable(inode))  			return -EPERM;  		ret = mnt_want_write_file(filp);  		if (ret)  			return ret; -		if (get_user(inode->i_generation, (int __user *) arg)) { +		if (get_user(generation, (int __user *) arg)) {  			ret = -EFAULT; -		} else { -			inode->i_ctime = CURRENT_TIME_SEC; -			mark_inode_dirty(inode); +			goto setversion_out;  		} + +		mutex_lock(&inode->i_mutex); +		inode->i_ctime = CURRENT_TIME_SEC; +		inode->i_generation = generation; +		mutex_unlock(&inode->i_mutex); + +		mark_inode_dirty(inode); +setversion_out:  		mnt_drop_write_file(filp);  		return ret; +	}  	case EXT2_IOC_GETRSVSZ:  		if (test_opt(inode->i_sb, RESERVATION)  			&& S_ISREG(inode->i_mode) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 080419814ba..dffb8653628 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -195,9 +195,6 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,  	struct inode *inode = old_dentry->d_inode;  	int err; -	if (inode->i_nlink >= EXT2_LINK_MAX) -		return -EMLINK; -  	dquot_initialize(dir);  	inode->i_ctime = CURRENT_TIME_SEC; @@ -217,10 +214,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,  static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)  {  	struct inode * inode; -	int err = -EMLINK; - -	if 
(dir->i_nlink >= EXT2_LINK_MAX) -		goto out; +	int err;  	dquot_initialize(dir); @@ -346,11 +340,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,  			drop_nlink(new_inode);  		inode_dec_link_count(new_inode);  	} else { -		if (dir_de) { -			err = -EMLINK; -			if (new_dir->i_nlink >= EXT2_LINK_MAX) -				goto out_dir; -		}  		err = ext2_add_link(new_dentry, old_inode);  		if (err)  			goto out_dir; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0090595beb2..e1025c7a437 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -919,6 +919,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)  	}  	sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits); +	sb->s_max_links = EXT2_LINK_MAX;  	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) {  		sbi->s_inode_size = EXT2_GOOD_OLD_INODE_SIZE; @@ -1087,9 +1088,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)  		goto failed_mount3;  	} -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) { -		iput(root);  		ext2_msg(sb, KERN_ERR, "error: get root inode failed");  		ret = -ENOMEM;  		goto failed_mount3; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a2038928f9a..1e036b79384 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1743,8 +1743,11 @@ allocated:  	*errp = 0;  	brelse(bitmap_bh); -	dquot_free_block(inode, *count-num); -	*count = num; + +	if (num < *count) { +		dquot_free_block(inode, *count-num); +		*count = num; +	}  	trace_ext3_allocate_blocks(inode, goal, num,  				   (unsigned long long)ret_block); @@ -1970,7 +1973,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,  	sbi = EXT3_SB(sb);  	 /* Walk through the whole group */ -	while (start < max) { +	while (start <= max) {  		start = bitmap_search_next_usable_block(start, bitmap_bh, max);  		if (start < 0)  			break; @@ -1980,7 +1983,7 @@ static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,  		 * 
Allocate contiguous free extents by setting bits in the  		 * block bitmap  		 */ -		while (next < max +		while (next <= max  			&& claim_block(sb_bgl_lock(sbi, group),  					next, bitmap_bh)) {  			next++; @@ -2091,73 +2094,74 @@ err_out:   */  int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)  { -	ext3_grpblk_t last_block, first_block, free_blocks; -	unsigned long first_group, last_group; -	unsigned long group, ngroups; +	ext3_grpblk_t last_block, first_block; +	unsigned long group, first_group, last_group;  	struct ext3_group_desc *gdp;  	struct ext3_super_block *es = EXT3_SB(sb)->s_es; -	uint64_t start, len, minlen, trimmed; +	uint64_t start, minlen, end, trimmed = 0; +	ext3_fsblk_t first_data_blk = +			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);  	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);  	int ret = 0; -	start = (range->start >> sb->s_blocksize_bits) + -		le32_to_cpu(es->s_first_data_block); -	len = range->len >> sb->s_blocksize_bits; +	start = range->start >> sb->s_blocksize_bits; +	end = start + (range->len >> sb->s_blocksize_bits) - 1;  	minlen = range->minlen >> sb->s_blocksize_bits; -	trimmed = 0; -	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) +	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) || +	    unlikely(start >= max_blks))  		return -EINVAL; -	if (start >= max_blks) -		return -EINVAL; -	if (start + len > max_blks) -		len = max_blks - start; +	if (end >= max_blks) +		end = max_blks - 1; +	if (end <= first_data_blk) +		goto out; +	if (start < first_data_blk) +		start = first_data_blk; -	ngroups = EXT3_SB(sb)->s_groups_count;  	smp_rmb();  	/* Determine first and last group to examine based on start and len */  	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,  				     &first_group, &first_block); -	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len), +	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,  				     &last_group, &last_block); -	last_group = (last_group > ngroups - 1) ? 
ngroups - 1 : last_group; -	last_block = EXT3_BLOCKS_PER_GROUP(sb); -	if (first_group > last_group) -		return -EINVAL; +	/* end now represents the last block to discard in this group */ +	end = EXT3_BLOCKS_PER_GROUP(sb) - 1;  	for (group = first_group; group <= last_group; group++) {  		gdp = ext3_get_group_desc(sb, group, NULL);  		if (!gdp)  			break; -		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); -		if (free_blocks < minlen) -			continue; -  		/*  		 * For all the groups except the last one, last block will -		 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to -		 * change it for the last group in which case first_block + -		 * len < EXT3_BLOCKS_PER_GROUP(sb). +		 * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to +		 * change it for the last group, note that last_block is +		 * already computed earlier by ext3_get_group_no_and_offset()  		 */ -		if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb)) -			last_block = first_block + len; -		len -= last_block - first_block; +		if (group == last_group) +			end = last_block; -		ret = ext3_trim_all_free(sb, group, first_block, -					last_block, minlen); -		if (ret < 0) -			break; +		if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) { +			ret = ext3_trim_all_free(sb, group, first_block, +						 end, minlen); +			if (ret < 0) +				break; +			trimmed += ret; +		} -		trimmed += ret; +		/* +		 * For every group except the first one, we are sure +		 * that the first block to discard will be block #0. 
+		 */  		first_block = 0;  	} -	if (ret >= 0) +	if (ret > 0)  		ret = 0; -	range->len = trimmed * sb->s_blocksize; +out: +	range->len = trimmed * sb->s_blocksize;  	return ret;  } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2d0afeca0b4..6d3418662b5 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -756,6 +756,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,  	struct ext3_block_alloc_info *block_i;  	ext3_fsblk_t current_block;  	struct ext3_inode_info *ei = EXT3_I(inode); +	struct timespec now;  	block_i = ei->i_block_alloc_info;  	/* @@ -795,9 +796,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,  	}  	/* We are done with atomic stuff, now do the rest of housekeeping */ - -	inode->i_ctime = CURRENT_TIME_SEC; -	ext3_mark_inode_dirty(handle, inode); +	now = CURRENT_TIME_SEC; +	if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) { +		inode->i_ctime = now; +		ext3_mark_inode_dirty(handle, inode); +	}  	/* ext3_mark_inode_dirty already updated i_sync_tid */  	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 726c7ef6cdf..e0b45b93327 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2046,10 +2046,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)  		ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");  		goto failed_mount3;  	} -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) {  		ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); -		iput(root);  		ret = -ENOMEM;  		goto failed_mount3;  	} diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9e2cd8cf71..4bbd07a6fa1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -336,10 +336,10 @@ err_out:   * Return buffer_head on success or NULL in case of failure.   
*/  struct buffer_head * -ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)  {  	struct ext4_group_desc *desc; -	struct buffer_head *bh = NULL; +	struct buffer_head *bh;  	ext4_fsblk_t bitmap_blk;  	desc = ext4_get_group_desc(sb, block_group, NULL); @@ -348,9 +348,9 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)  	bitmap_blk = ext4_block_bitmap(sb, desc);  	bh = sb_getblk(sb, bitmap_blk);  	if (unlikely(!bh)) { -		ext4_error(sb, "Cannot read block bitmap - " -			    "block_group = %u, block_bitmap = %llu", -			    block_group, bitmap_blk); +		ext4_error(sb, "Cannot get buffer for block bitmap - " +			   "block_group = %u, block_bitmap = %llu", +			   block_group, bitmap_blk);  		return NULL;  	} @@ -382,25 +382,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)  		return bh;  	}  	/* -	 * submit the buffer_head for read. We can -	 * safely mark the bitmap as uptodate now. -	 * We do it here so the bitmap uptodate bit -	 * get set with buffer lock held. 
+	 * submit the buffer_head for reading  	 */ +	set_buffer_new(bh);  	trace_ext4_read_block_bitmap_load(sb, block_group); -	set_bitmap_uptodate(bh); -	if (bh_submit_read(bh) < 0) { -		put_bh(bh); +	bh->b_end_io = ext4_end_bitmap_read; +	get_bh(bh); +	submit_bh(READ, bh); +	return bh; +} + +/* Returns 0 on success, 1 on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, +			   struct buffer_head *bh) +{ +	struct ext4_group_desc *desc; + +	if (!buffer_new(bh)) +		return 0; +	desc = ext4_get_group_desc(sb, block_group, NULL); +	if (!desc) +		return 1; +	wait_on_buffer(bh); +	if (!buffer_uptodate(bh)) {  		ext4_error(sb, "Cannot read block bitmap - " -			    "block_group = %u, block_bitmap = %llu", -			    block_group, bitmap_blk); -		return NULL; +			   "block_group = %u, block_bitmap = %llu", +			   block_group, (unsigned long long) bh->b_blocknr); +		return 1;  	} +	clear_buffer_new(bh); +	/* Panic or remount fs read-only if block bitmap is invalid */  	ext4_valid_block_bitmap(sb, desc, block_group, bh); -	/* -	 * file system mounted not to panic on error, -	 * continue with corrupt bitmap -	 */ +	return 0; +} + +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ +	struct buffer_head *bh; + +	bh = ext4_read_block_bitmap_nowait(sb, block_group); +	if (ext4_wait_block_bitmap(sb, block_group, bh)) { +		put_bh(bh); +		return NULL; +	}  	return bh;  } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 164c56092e5..ad56866d729 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -91,17 +91,17 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,  		return 0;  	if (filp) -		ext4_error_file(filp, function, line, bh ? 
bh->b_blocknr : 0, +		ext4_error_file(filp, function, line, bh->b_blocknr,  				"bad entry in directory: %s - offset=%u(%u), "  				"inode=%u, rec_len=%d, name_len=%d", -				error_msg, (unsigned) (offset%bh->b_size), +				error_msg, (unsigned) (offset % bh->b_size),  				offset, le32_to_cpu(de->inode),  				rlen, de->name_len);  	else -		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0, +		ext4_error_inode(dir, function, line, bh->b_blocknr,  				"bad entry in directory: %s - offset=%u(%u), "  				"inode=%u, rec_len=%d, name_len=%d", -				error_msg, (unsigned) (offset%bh->b_size), +				error_msg, (unsigned) (offset % bh->b_size),  				offset, le32_to_cpu(de->inode),  				rlen, de->name_len); @@ -425,8 +425,9 @@ static int call_filldir(struct file *filp, void *dirent,  	sb = inode->i_sb;  	if (!fname) { -		printk(KERN_ERR "EXT4-fs: call_filldir: called with " -		       "null fname?!?\n"); +		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " +			 "called with null fname?!?", __func__, __LINE__, +			 inode->i_ino, current->comm);  		return 0;  	}  	curr_pos = hash2pos(fname->hash, fname->minor_hash); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 513004fc3d8..ded731ac8a3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -53,7 +53,7 @@  		printk(KERN_DEBUG f, ## a);				\  	} while (0)  #else -#define ext4_debug(f, a...)	do {} while (0) +#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)  #endif  #define EXT4_ERROR_INODE(inode, fmt, a...) \ @@ -184,6 +184,8 @@ struct mpage_da_data {  #define	EXT4_IO_END_UNWRITTEN	0x0001  #define EXT4_IO_END_ERROR	0x0002  #define EXT4_IO_END_QUEUED	0x0004 +#define EXT4_IO_END_DIRECT	0x0008 +#define EXT4_IO_END_IN_FSYNC	0x0010  struct ext4_io_page {  	struct page	*p_page; @@ -192,18 +194,25 @@ struct ext4_io_page {  #define MAX_IO_PAGES 128 +/* + * For converting uninitialized extents on a work queue. 
+ * + * 'page' is only used from the writepage() path; 'pages' is only used for + * buffered writes; they are used to keep page references until conversion + * takes place.  For AIO/DIO, neither field is filled in. + */  typedef struct ext4_io_end {  	struct list_head	list;		/* per-file finished IO list */  	struct inode		*inode;		/* file being written to */  	unsigned int		flag;		/* unwritten or not */ -	struct page		*page;		/* page struct for buffer write */ +	struct page		*page;		/* for writepage() path */  	loff_t			offset;		/* offset in the file */  	ssize_t			size;		/* size of the extent */  	struct work_struct	work;		/* data work queue */  	struct kiocb		*iocb;		/* iocb struct for AIO */  	int			result;		/* error value for AIO */ -	int			num_io_pages; -	struct ext4_io_page	*pages[MAX_IO_PAGES]; +	int			num_io_pages;   /* for writepages() */ +	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */  } ext4_io_end_t;  struct ext4_io_submit { @@ -923,6 +932,7 @@ struct ext4_inode_info {  #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */  #define EXT4_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */  #define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK		0x00070  #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */  #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/  #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */ @@ -941,7 +951,6 @@ struct ext4_inode_info {  #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */  #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */  #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */  #define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */  #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */  #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 
/* Abort on file data write */ @@ -1142,6 +1151,7 @@ struct ext4_sb_info {  	unsigned int s_mount_opt;  	unsigned int s_mount_opt2;  	unsigned int s_mount_flags; +	unsigned int s_def_mount_opt;  	ext4_fsblk_t s_sb_block;  	uid_t s_resuid;  	gid_t s_resgid; @@ -1420,8 +1430,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)  #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200  #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */  #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x2000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */  #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */  #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR  #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1794,8 +1805,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,  						    ext4_group_t block_group,  						    struct buffer_head ** bh);  extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, -				      ext4_group_t block_group); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, +						ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, +				  ext4_group_t block_group, +				  struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, +						  ext4_group_t block_group);  extern void ext4_init_block_bitmap(struct super_block *sb,  				   struct buffer_head *bh,  				   ext4_group_t group, @@ -1841,6 +1858,7 @@ extern void ext4_check_inodes_bitmap(struct super_block *);  extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);  extern int ext4_init_inode_table(struct super_block 
*sb,  				 ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);  /* mballoc.c */  extern long ext4_mb_stats; diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a52db3a69a3..0f58b86e3a0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -47,9 +47,9 @@   */  #define EXT_DEBUG__  #ifdef EXT_DEBUG -#define ext_debug(a...)		printk(a) +#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)  #else -#define ext_debug(a...) +#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)  #endif  /* diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5802fa1dab1..83b20fcf940 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -104,6 +104,78 @@  #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))  #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +/** + *   struct ext4_journal_cb_entry - Base structure for callback information. + * + *   This struct is a 'seed' structure for a using with your own callback + *   structs. If you are using callbacks you must allocate one of these + *   or another struct of your own definition which has this struct + *   as it's first element and pass it to ext4_journal_callback_add(). 
+ */ +struct ext4_journal_cb_entry { +	/* list information for other callbacks attached to the same handle */ +	struct list_head jce_list; + +	/*  Function to call with this callback structure */ +	void (*jce_func)(struct super_block *sb, +			 struct ext4_journal_cb_entry *jce, int error); + +	/* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + *        @sb: superblock of current filesystem for transaction + *        @jce: returned journal callback data + *        @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. 
+ */ +static inline void ext4_journal_callback_add(handle_t *handle, +			void (*func)(struct super_block *sb, +				     struct ext4_journal_cb_entry *jce, +				     int rc), +			struct ext4_journal_cb_entry *jce) +{ +	struct ext4_sb_info *sbi = +			EXT4_SB(handle->h_transaction->t_journal->j_private); + +	/* Add the jce to transaction's private list */ +	jce->jce_func = func; +	spin_lock(&sbi->s_md_lock); +	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); +	spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + */ +static inline void ext4_journal_callback_del(handle_t *handle, +					     struct ext4_journal_cb_entry *jce) +{ +	struct ext4_sb_info *sbi = +			EXT4_SB(handle->h_transaction->t_journal->j_private); + +	spin_lock(&sbi->s_md_lock); +	list_del_init(&jce->jce_list); +	spin_unlock(&sbi->s_md_lock); +} +  int  ext4_mark_iloc_dirty(handle_t *handle,  		     struct inode *inode, @@ -261,43 +333,45 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,  /* super.c */  int ext4_force_commit(struct super_block *sb); -static inline int ext4_should_journal_data(struct inode *inode) +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE	0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE	0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE	0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode)  {  	if (EXT4_JOURNAL(inode) == NULL) -		return 0; -	if (!S_ISREG(inode->i_mode)) -		return 1; -	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) -		return 1; -	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) -		return 1; -	return 0; +		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */ +	/* We do not support data journalling with 
delayed allocation */ +	if (!S_ISREG(inode->i_mode) || +	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) +		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */ +	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && +	    !test_opt(inode->i_sb, DELALLOC)) +		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */ +	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) +		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */ +	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) +		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */ +	else +		BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ +	return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;  }  static inline int ext4_should_order_data(struct inode *inode)  { -	if (EXT4_JOURNAL(inode) == NULL) -		return 0; -	if (!S_ISREG(inode->i_mode)) -		return 0; -	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) -		return 0; -	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) -		return 1; -	return 0; +	return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;  }  static inline int ext4_should_writeback_data(struct inode *inode)  { -	if (EXT4_JOURNAL(inode) == NULL) -		return 1; -	if (!S_ISREG(inode->i_mode)) -		return 0; -	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) -		return 0; -	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) -		return 1; -	return 0; +	return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;  }  /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74f23c292e1..1421938e679 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,6 +44,14 @@  #include <trace/events/ext4.h> +/* + * used by extent splitting. 
+ */ +#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \ +					due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */ +  static int ext4_split_extent(handle_t *handle,  				struct inode *inode,  				struct ext4_ext_path *path, @@ -51,6 +59,13 @@ static int ext4_split_extent(handle_t *handle,  				int split_flag,  				int flags); +static int ext4_split_extent_at(handle_t *handle, +			     struct inode *inode, +			     struct ext4_ext_path *path, +			     ext4_lblk_t split, +			     int split_flag, +			     int flags); +  static int ext4_ext_truncate_extend_restart(handle_t *handle,  					    struct inode *inode,  					    int needed) @@ -300,6 +315,8 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)  	ext4_fsblk_t block = ext4_ext_pblock(ext);  	int len = ext4_ext_get_actual_len(ext); +	if (len == 0) +		return 0;  	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);  } @@ -2308,7 +2325,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  	struct ext4_extent *ex;  	/* the header must be checked already in ext4_ext_remove_space() */ -	ext_debug("truncate since %u in leaf\n", start); +	ext_debug("truncate since %u in leaf to %u\n", start, end);  	if (!path[depth].p_hdr)  		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);  	eh = path[depth].p_hdr; @@ -2343,14 +2360,17 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  		ext_debug("  border %u:%u\n", a, b);  		/* If this extent is beyond the end of the hole, skip it */ -		if (end <= ex_ee_block) { +		if (end < ex_ee_block) {  			ex--;  			ex_ee_block = le32_to_cpu(ex->ee_block);  			ex_ee_len = ext4_ext_get_actual_len(ex);  			continue;  		} else if (b != ex_ee_block + ex_ee_len - 1) { -			EXT4_ERROR_INODE(inode,"  bad truncate %u:%u\n", -					 start, end); +			EXT4_ERROR_INODE(inode, +					 "can not handle truncate %u:%u " +					 "on 
extent %u:%u", +					 start, end, ex_ee_block, +					 ex_ee_block + ex_ee_len - 1);  			err = -EIO;  			goto out;  		} else if (a != ex_ee_block) { @@ -2482,7 +2502,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)  	return 1;  } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, +				 ext4_lblk_t end)  {  	struct super_block *sb = inode->i_sb;  	int depth = ext_depth(inode); @@ -2491,7 +2512,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)  	handle_t *handle;  	int i, err; -	ext_debug("truncate since %u\n", start); +	ext_debug("truncate since %u to %u\n", start, end);  	/* probably first extent we're gonna free will be last in block */  	handle = ext4_journal_start(inode, depth + 1); @@ -2504,6 +2525,61 @@ again:  	trace_ext4_ext_remove_space(inode, start, depth);  	/* +	 * Check if we are removing extents inside the extent tree. If that +	 * is the case, we are going to punch a hole inside the extent tree +	 * so we have to check whether we need to split the extent covering +	 * the last block to remove so we can easily remove the part of it +	 * in ext4_ext_rm_leaf(). +	 */ +	if (end < EXT_MAX_BLOCKS - 1) { +		struct ext4_extent *ex; +		ext4_lblk_t ee_block; + +		/* find extent for this block */ +		path = ext4_ext_find_extent(inode, end, NULL); +		if (IS_ERR(path)) { +			ext4_journal_stop(handle); +			return PTR_ERR(path); +		} +		depth = ext_depth(inode); +		ex = path[depth].p_ext; +		if (!ex) +			goto cont; + +		ee_block = le32_to_cpu(ex->ee_block); + +		/* +		 * See if the last block is inside the extent, if so split +		 * the extent at 'end' block so we can easily remove the +		 * tail of the first part of the split extent in +		 * ext4_ext_rm_leaf(). 
+		 */ +		if (end >= ee_block && +		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) { +			int split_flag = 0; + +			if (ext4_ext_is_uninitialized(ex)) +				split_flag = EXT4_EXT_MARK_UNINIT1 | +					     EXT4_EXT_MARK_UNINIT2; + +			/* +			 * Split the extent in two so that 'end' is the last +			 * block in the first new extent +			 */ +			err = ext4_split_extent_at(handle, inode, path, +						end + 1, split_flag, +						EXT4_GET_BLOCKS_PRE_IO | +						EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + +			if (err < 0) +				goto out; +		} +		ext4_ext_drop_refs(path); +		kfree(path); +	} +cont: + +	/*  	 * We start scanning from right side, freeing all the blocks  	 * after i_size and walking into the tree depth-wise.  	 */ @@ -2515,6 +2591,7 @@ again:  	}  	path[0].p_depth = depth;  	path[0].p_hdr = ext_inode_hdr(inode); +  	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {  		err = -EIO;  		goto out; @@ -2526,7 +2603,7 @@ again:  			/* this is leaf block */  			err = ext4_ext_rm_leaf(handle, inode, path,  					       &partial_cluster, start, -					       EXT_MAX_BLOCKS - 1); +					       end);  			/* root level has p_bh == NULL, brelse() eats this */  			brelse(path[i].p_bh);  			path[i].p_bh = NULL; @@ -2651,17 +2728,17 @@ void ext4_ext_init(struct super_block *sb)  	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {  #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) -		printk(KERN_INFO "EXT4-fs: file extents enabled"); +		printk(KERN_INFO "EXT4-fs: file extents enabled"  #ifdef AGGRESSIVE_TEST -		printk(", aggressive tests"); +		       ", aggressive tests"  #endif  #ifdef CHECK_BINSEARCH -		printk(", check binsearch"); +		       ", check binsearch"  #endif  #ifdef EXTENTS_STATS -		printk(", stats"); +		       ", stats"  #endif -		printk("\n"); +		       "\n");  #endif  #ifdef EXTENTS_STATS  		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); @@ -2709,14 +2786,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct 
ext4_extent *ex)  }  /* - * used by extent splitting. - */ -#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \ -					due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */ - -/*   * ext4_split_extent_at() splits an extent at given block.   *   * @handle: the journal handle @@ -3224,11 +3293,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,  	depth = ext_depth(inode);  	eh = path[depth].p_hdr; -	if (unlikely(!eh->eh_entries)) { -		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " -				 "EOFBLOCKS_FL set"); -		return -EIO; -	} +	/* +	 * We're going to remove EOFBLOCKS_FL entirely in future so we +	 * do not care for this case anymore. Simply remove the flag +	 * if there are no extents. +	 */ +	if (unlikely(!eh->eh_entries)) +		goto out;  	last_ex = EXT_LAST_EXTENT(eh);  	/*  	 * We should clear the EOFBLOCKS_FL flag if we are writing the @@ -3252,6 +3323,7 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,  	for (i = depth-1; i >= 0; i--)  		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))  			return 0; +out:  	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);  	return ext4_mark_inode_dirty(handle, inode);  } @@ -3710,8 +3782,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  	int free_on_err = 0, err = 0, depth, ret;  	unsigned int allocated = 0, offset = 0;  	unsigned int allocated_clusters = 0; -	unsigned int punched_out = 0; -	unsigned int result = 0;  	struct ext4_allocation_request ar;  	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;  	ext4_lblk_t cluster_offset; @@ -3721,8 +3791,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);  	/* check in cache */ -	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && -		ext4_ext_in_cache(inode, map->m_lblk, &newex)) { +	if 
(ext4_ext_in_cache(inode, map->m_lblk, &newex)) {  		if (!newex.ee_start_lo && !newex.ee_start_hi) {  			if ((sbi->s_cluster_ratio > 1) &&  			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) @@ -3790,113 +3859,25 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,  		/* if found extent covers block, simply return it */  		if (in_range(map->m_lblk, ee_block, ee_len)) { -			struct ext4_map_blocks punch_map; -			ext4_fsblk_t partial_cluster = 0; -  			newblock = map->m_lblk - ee_block + ee_start;  			/* number of remaining blocks in the extent */  			allocated = ee_len - (map->m_lblk - ee_block);  			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,  				  ee_block, ee_len, newblock); -			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { -				/* -				 * Do not put uninitialized extent -				 * in the cache -				 */ -				if (!ext4_ext_is_uninitialized(ex)) { -					ext4_ext_put_in_cache(inode, ee_block, -						ee_len, ee_start); -					goto out; -				} -				ret = ext4_ext_handle_uninitialized_extents( -					handle, inode, map, path, flags, -					allocated, newblock); -				return ret; -			} -  			/* -			 * Punch out the map length, but only to the -			 * end of the extent +			 * Do not put uninitialized extent +			 * in the cache  			 */ -			punched_out = allocated < map->m_len ? 
-				allocated : map->m_len; - -			/* -			 * Sense extents need to be converted to -			 * uninitialized, they must fit in an -			 * uninitialized extent -			 */ -			if (punched_out > EXT_UNINIT_MAX_LEN) -				punched_out = EXT_UNINIT_MAX_LEN; - -			punch_map.m_lblk = map->m_lblk; -			punch_map.m_pblk = newblock; -			punch_map.m_len = punched_out; -			punch_map.m_flags = 0; - -			/* Check to see if the extent needs to be split */ -			if (punch_map.m_len != ee_len || -				punch_map.m_lblk != ee_block) { - -				ret = ext4_split_extent(handle, inode, -				path, &punch_map, 0, -				EXT4_GET_BLOCKS_PUNCH_OUT_EXT | -				EXT4_GET_BLOCKS_PRE_IO); - -				if (ret < 0) { -					err = ret; -					goto out2; -				} -				/* -				 * find extent for the block at -				 * the start of the hole -				 */ -				ext4_ext_drop_refs(path); -				kfree(path); - -				path = ext4_ext_find_extent(inode, -				map->m_lblk, NULL); -				if (IS_ERR(path)) { -					err = PTR_ERR(path); -					path = NULL; -					goto out2; -				} - -				depth = ext_depth(inode); -				ex = path[depth].p_ext; -				ee_len = ext4_ext_get_actual_len(ex); -				ee_block = le32_to_cpu(ex->ee_block); -				ee_start = ext4_ext_pblock(ex); - -			} - -			ext4_ext_mark_uninitialized(ex); - -			ext4_ext_invalidate_cache(inode); - -			err = ext4_ext_rm_leaf(handle, inode, path, -					       &partial_cluster, map->m_lblk, -					       map->m_lblk + punched_out); - -			if (!err && path->p_hdr->eh_entries == 0) { -				/* -				 * Punch hole freed all of this sub tree, -				 * so we need to correct eh_depth -				 */ -				err = ext4_ext_get_access(handle, inode, path); -				if (err == 0) { -					ext_inode_hdr(inode)->eh_depth = 0; -					ext_inode_hdr(inode)->eh_max = -					cpu_to_le16(ext4_ext_space_root( -						inode, 0)); - -					err = ext4_ext_dirty( -						handle, inode, path); -				} +			if (!ext4_ext_is_uninitialized(ex)) { +				ext4_ext_put_in_cache(inode, ee_block, +					ee_len, ee_start); +				goto out;  			} - -			goto out2; +			ret = 
ext4_ext_handle_uninitialized_extents( +				handle, inode, map, path, flags, +				allocated, newblock); +			return ret;  		}  	} @@ -4165,13 +4146,11 @@ out2:  		ext4_ext_drop_refs(path);  		kfree(path);  	} -	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? -			punched_out : allocated;  	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, -		newblock, map->m_len, err ? err : result); +		newblock, map->m_len, err ? err : allocated); -	return err ? err : result; +	return err ? err : allocated;  }  void ext4_ext_truncate(struct inode *inode) @@ -4228,7 +4207,7 @@ void ext4_ext_truncate(struct inode *inode)  	last_block = (inode->i_size + sb->s_blocksize - 1)  			>> EXT4_BLOCK_SIZE_BITS(sb); -	err = ext4_ext_remove_space(inode, last_block); +	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);  	/* In a multi-transaction truncate, we only make the final  	 * transaction synchronous. @@ -4436,10 +4415,11 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,  				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);  		if (ret <= 0) {  			WARN_ON(ret <= 0); -			printk(KERN_ERR "%s: ext4_ext_map_blocks " -				    "returned error inode#%lu, block=%u, " -				    "max_blocks=%u", __func__, -				    inode->i_ino, map.m_lblk, map.m_len); +			ext4_msg(inode->i_sb, KERN_ERR, +				 "%s:%d: inode #%lu: block %u: len %u: " +				 "ext4_ext_map_blocks returned %d", +				 __func__, __LINE__, inode->i_ino, map.m_lblk, +				 map.m_len, ret);  		}  		ext4_mark_inode_dirty(handle, inode);  		ret2 = ext4_journal_stop(handle); @@ -4705,14 +4685,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)  {  	struct inode *inode = file->f_path.dentry->d_inode;  	struct super_block *sb = inode->i_sb; -	struct ext4_ext_cache cache_ex; -	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; +	ext4_lblk_t first_block, stop_block;  	struct address_space *mapping = inode->i_mapping; -	struct ext4_map_blocks map;  	handle_t *handle;  	loff_t 
first_page, last_page, page_len;  	loff_t first_page_offset, last_page_offset; -	int ret, credits, blocks_released, err = 0; +	int credits, err = 0;  	/* No need to punch hole beyond i_size */  	if (offset >= inode->i_size) @@ -4728,10 +4706,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)  		   offset;  	} -	first_block = (offset + sb->s_blocksize - 1) >> -		EXT4_BLOCK_SIZE_BITS(sb); -	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); -  	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;  	last_page = (offset + length) >> PAGE_CACHE_SHIFT; @@ -4810,7 +4784,6 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)  		}  	} -  	/*  	 * If i_size is contained in the last page, we need to  	 * unmap and zero the partial page after i_size @@ -4830,73 +4803,22 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)  		}  	} +	first_block = (offset + sb->s_blocksize - 1) >> +		EXT4_BLOCK_SIZE_BITS(sb); +	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); +  	/* If there are no blocks to remove, return now */ -	if (first_block >= last_block) +	if (first_block >= stop_block)  		goto out;  	down_write(&EXT4_I(inode)->i_data_sem);  	ext4_ext_invalidate_cache(inode);  	ext4_discard_preallocations(inode); -	/* -	 * Loop over all the blocks and identify blocks -	 * that need to be punched out -	 */ -	iblock = first_block; -	blocks_released = 0; -	while (iblock < last_block) { -		max_blocks = last_block - iblock; -		num_blocks = 1; -		memset(&map, 0, sizeof(map)); -		map.m_lblk = iblock; -		map.m_len = max_blocks; -		ret = ext4_ext_map_blocks(handle, inode, &map, -			EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - -		if (ret > 0) { -			blocks_released += ret; -			num_blocks = ret; -		} else if (ret == 0) { -			/* -			 * If map blocks could not find the block, -			 * then it is in a hole.  If the hole was -			 * not already cached, then map blocks should -			 * put it in the cache.  
So we can get the hole -			 * out of the cache -			 */ -			memset(&cache_ex, 0, sizeof(cache_ex)); -			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && -				!cache_ex.ec_start) { - -				/* The hole is cached */ -				num_blocks = cache_ex.ec_block + -				cache_ex.ec_len - iblock; +	err = ext4_ext_remove_space(inode, first_block, stop_block - 1); -			} else { -				/* The block could not be identified */ -				err = -EIO; -				break; -			} -		} else { -			/* Map blocks error */ -			err = ret; -			break; -		} - -		if (num_blocks == 0) { -			/* This condition should never happen */ -			ext_debug("Block lookup failed"); -			err = -EIO; -			break; -		} - -		iblock += num_blocks; -	} - -	if (blocks_released > 0) { -		ext4_ext_invalidate_cache(inode); -		ext4_discard_preallocations(inode); -	} +	ext4_ext_invalidate_cache(inode); +	ext4_discard_preallocations(inode);  	if (IS_SYNC(inode))  		ext4_handle_sync(handle); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 00a2cb753ef..bb6c7d81131 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode)  		io = list_entry(ei->i_completed_io_list.next,  				ext4_io_end_t, list);  		list_del_init(&io->list); +		io->flag |= EXT4_IO_END_IN_FSYNC;  		/*  		 * Calling ext4_end_io_nolock() to convert completed  		 * IO to written. @@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode)  		if (ret < 0)  			ret2 = ret;  		spin_lock_irqsave(&ei->i_completed_io_lock, flags); +		io->flag &= ~EXT4_IO_END_IN_FSYNC;  	}  	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  	return (ret2 < 0) ? 
ret2 : 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25d8c9781ad..409c2ee7750 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -92,6 +92,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,  	return EXT4_INODES_PER_GROUP(sb);  } +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ +	if (uptodate) { +		set_buffer_uptodate(bh); +		set_bitmap_uptodate(bh); +	} +	unlock_buffer(bh); +	put_bh(bh); +} +  /*   * Read the inode allocation bitmap for a given block_group, reading   * into the specified slot in the superblock's bitmap cache. @@ -147,18 +157,18 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)  		return bh;  	}  	/* -	 * submit the buffer_head for read. We can -	 * safely mark the bitmap as uptodate now. -	 * We do it here so the bitmap uptodate bit -	 * get set with buffer lock held. +	 * submit the buffer_head for reading  	 */  	trace_ext4_load_inode_bitmap(sb, block_group); -	set_bitmap_uptodate(bh); -	if (bh_submit_read(bh) < 0) { +	bh->b_end_io = ext4_end_bitmap_read; +	get_bh(bh); +	submit_bh(READ, bh); +	wait_on_buffer(bh); +	if (!buffer_uptodate(bh)) {  		put_bh(bh);  		ext4_error(sb, "Cannot read inode bitmap - " -			    "block_group = %u, inode_bitmap = %llu", -			    block_group, bitmap_blk); +			   "block_group = %u, inode_bitmap = %llu", +			   block_group, bitmap_blk);  		return NULL;  	}  	return bh; @@ -194,19 +204,20 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)  	struct ext4_sb_info *sbi;  	int fatal = 0, err, count, cleared; -	if (atomic_read(&inode->i_count) > 1) { -		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", -		       atomic_read(&inode->i_count)); +	if (!sb) { +		printk(KERN_ERR "EXT4-fs: %s:%d: inode on " +		       "nonexistent device\n", __func__, __LINE__);  		return;  	} -	if (inode->i_nlink) { -		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", -		       inode->i_nlink); +	if (atomic_read(&inode->i_count) > 1) { +		
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", +			 __func__, __LINE__, inode->i_ino, +			 atomic_read(&inode->i_count));  		return;  	} -	if (!sb) { -		printk(KERN_ERR "ext4_free_inode: inode on " -		       "nonexistent device\n"); +	if (inode->i_nlink) { +		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", +			 __func__, __LINE__, inode->i_ino, inode->i_nlink);  		return;  	}  	sbi = EXT4_SB(sb); @@ -593,94 +604,6 @@ static int find_group_other(struct super_block *sb, struct inode *parent,  }  /* - * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's ext4_group_lock - * and clear the uninit flag. The inode bitmap update - * and group desc uninit flag clear should be done - * after holding ext4_group_lock so that ext4_read_inode_bitmap - * doesn't race with the ext4_claim_inode - */ -static int ext4_claim_inode(struct super_block *sb, -			struct buffer_head *inode_bitmap_bh, -			unsigned long ino, ext4_group_t group, umode_t mode) -{ -	int free = 0, retval = 0, count; -	struct ext4_sb_info *sbi = EXT4_SB(sb); -	struct ext4_group_info *grp = ext4_get_group_info(sb, group); -	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - -	/* -	 * We have to be sure that new inode allocation does not race with -	 * inode table initialization, because otherwise we may end up -	 * allocating and writing new inode right before sb_issue_zeroout -	 * takes place and overwriting our new inode with zeroes. So we -	 * take alloc_sem to prevent it. 
-	 */ -	down_read(&grp->alloc_sem); -	ext4_lock_group(sb, group); -	if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) { -		/* not a free inode */ -		retval = 1; -		goto err_ret; -	} -	ino++; -	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || -			ino > EXT4_INODES_PER_GROUP(sb)) { -		ext4_unlock_group(sb, group); -		up_read(&grp->alloc_sem); -		ext4_error(sb, "reserved inode or inode > inodes count - " -			   "block_group = %u, inode=%lu", group, -			   ino + group * EXT4_INODES_PER_GROUP(sb)); -		return 1; -	} -	/* If we didn't allocate from within the initialized part of the inode -	 * table then we need to initialize up to this inode. */ -	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - -		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { -			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); -			/* When marking the block group with -			 * ~EXT4_BG_INODE_UNINIT we don't want to depend -			 * on the value of bg_itable_unused even though -			 * mke2fs could have initialized the same for us. -			 * Instead we calculated the value below -			 */ - -			free = 0; -		} else { -			free = EXT4_INODES_PER_GROUP(sb) - -				ext4_itable_unused_count(sb, gdp); -		} - -		/* -		 * Check the relative inode number against the last used -		 * relative inode number in this group. 
if it is greater -		 * we need to  update the bg_itable_unused count -		 * -		 */ -		if (ino > free) -			ext4_itable_unused_set(sb, gdp, -					(EXT4_INODES_PER_GROUP(sb) - ino)); -	} -	count = ext4_free_inodes_count(sb, gdp) - 1; -	ext4_free_inodes_set(sb, gdp, count); -	if (S_ISDIR(mode)) { -		count = ext4_used_dirs_count(sb, gdp) + 1; -		ext4_used_dirs_set(sb, gdp, count); -		if (sbi->s_log_groups_per_flex) { -			ext4_group_t f = ext4_flex_group(sbi, group); - -			atomic_inc(&sbi->s_flex_groups[f].used_dirs); -		} -	} -	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); -err_ret: -	ext4_unlock_group(sb, group); -	up_read(&grp->alloc_sem); -	return retval; -} - -/*   * There are two policies for allocating an inode.  If the new inode is   * a directory, then a forward search is made for a block group with both   * free space and a low directory-to-inode ratio; if that fails, then of @@ -741,6 +664,11 @@ got_group:  	if (ret2 == -1)  		goto out; +	/* +	 * Normally we will only go through one pass of this loop, +	 * unless we get unlucky and it turns out the group we selected +	 * had its last inode grabbed by someone else. 
+	 */  	for (i = 0; i < ngroups; i++, ino = 0) {  		err = -EIO; @@ -757,51 +685,24 @@ repeat_in_this_group:  		ino = ext4_find_next_zero_bit((unsigned long *)  					      inode_bitmap_bh->b_data,  					      EXT4_INODES_PER_GROUP(sb), ino); - -		if (ino < EXT4_INODES_PER_GROUP(sb)) { - -			BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); -			err = ext4_journal_get_write_access(handle, -							    inode_bitmap_bh); -			if (err) -				goto fail; - -			BUFFER_TRACE(group_desc_bh, "get_write_access"); -			err = ext4_journal_get_write_access(handle, -								group_desc_bh); -			if (err) -				goto fail; -			if (!ext4_claim_inode(sb, inode_bitmap_bh, -						ino, group, mode)) { -				/* we won it */ -				BUFFER_TRACE(inode_bitmap_bh, -					"call ext4_handle_dirty_metadata"); -				err = ext4_handle_dirty_metadata(handle, -								 NULL, -							inode_bitmap_bh); -				if (err) -					goto fail; -				/* zero bit is inode number 1*/ -				ino++; -				goto got; -			} -			/* we lost it */ -			ext4_handle_release_buffer(handle, inode_bitmap_bh); -			ext4_handle_release_buffer(handle, group_desc_bh); - -			if (++ino < EXT4_INODES_PER_GROUP(sb)) -				goto repeat_in_this_group; +		if (ino >= EXT4_INODES_PER_GROUP(sb)) { +			if (++group == ngroups) +				group = 0; +			continue;  		} - -		/* -		 * This case is possible in concurrent environment.  It is very -		 * rare.  We cannot repeat the find_group_xxx() call because -		 * that will simply return the same blockgroup, because the -		 * group descriptor metadata has not yet been updated. -		 * So we just go onto the next blockgroup. 
-		 */ -		if (++group == ngroups) -			group = 0; +		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { +			ext4_error(sb, "reserved inode found cleared - " +				   "inode=%lu", ino + 1); +			continue; +		} +		ext4_lock_group(sb, group); +		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); +		ext4_unlock_group(sb, group); +		ino++;		/* the inode bitmap is zero-based */ +		if (!ret2) +			goto got; /* we grabbed the inode! */ +		if (ino < EXT4_INODES_PER_GROUP(sb)) +			goto repeat_in_this_group;  	}  	err = -ENOSPC;  	goto out; @@ -838,6 +739,59 @@ got:  		if (err)  			goto fail;  	} + +	BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); +	err = ext4_journal_get_write_access(handle, inode_bitmap_bh); +	if (err) +		goto fail; + +	BUFFER_TRACE(group_desc_bh, "get_write_access"); +	err = ext4_journal_get_write_access(handle, group_desc_bh); +	if (err) +		goto fail; + +	/* Update the relevant bg descriptor fields */ +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { +		int free; +		struct ext4_group_info *grp = ext4_get_group_info(sb, group); + +		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ +		ext4_lock_group(sb, group); /* while we modify the bg desc */ +		free = EXT4_INODES_PER_GROUP(sb) - +			ext4_itable_unused_count(sb, gdp); +		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { +			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); +			free = 0; +		} +		/* +		 * Check the relative inode number against the last used +		 * relative inode number in this group. 
if it is greater +		 * we need to update the bg_itable_unused count +		 */ +		if (ino > free) +			ext4_itable_unused_set(sb, gdp, +					(EXT4_INODES_PER_GROUP(sb) - ino)); +		up_read(&grp->alloc_sem); +	} +	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); +	if (S_ISDIR(mode)) { +		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); +		if (sbi->s_log_groups_per_flex) { +			ext4_group_t f = ext4_flex_group(sbi, group); + +			atomic_inc(&sbi->s_flex_groups[f].used_dirs); +		} +	} +	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { +		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +		ext4_unlock_group(sb, group); +	} + +	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); +	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); +	if (err) +		goto fail; +  	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");  	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);  	if (err) @@ -1101,7 +1055,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)   * where it is called from on active part of filesystem is ext4lazyinit   * thread, so we do not need any special locks, however we have to prevent   * inode allocation from the current group, so we take alloc_sem lock, to - * block ext4_claim_inode until we are finished. + * block ext4_new_inode() until we are finished.   
*/  int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,  				 int barrier) @@ -1149,9 +1103,9 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,  			    sbi->s_inodes_per_block);  	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { -		ext4_error(sb, "Something is wrong with group %u\n" -			   "Used itable blocks: %d" -			   "itable unused count: %u\n", +		ext4_error(sb, "Something is wrong with group %u: " +			   "used itable blocks: %d; " +			   "itable unused count: %u",  			   group, used_blks,  			   ext4_itable_unused_count(sb, gdp));  		ret = 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index feaa82fe629..c77b0bd2c71 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -272,7 +272,7 @@ void ext4_da_update_reserve_space(struct inode *inode,  	trace_ext4_da_update_reserve_space(inode, used, quota_claim);  	if (unlikely(used > ei->i_reserved_data_blocks)) {  		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " -			 "with only %d reserved data blocks\n", +			 "with only %d reserved data blocks",  			 __func__, inode->i_ino, used,  			 ei->i_reserved_data_blocks);  		WARN_ON(1); @@ -1165,7 +1165,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)  		 */  		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "  			 "ino %lu, to_free %d with only %d reserved " -			 "data blocks\n", inode->i_ino, to_free, +			 "data blocks", inode->i_ino, to_free,  			 ei->i_reserved_data_blocks);  		WARN_ON(1);  		to_free = ei->i_reserved_data_blocks; @@ -1428,20 +1428,22 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)  static void ext4_print_free_blocks(struct inode *inode)  {  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); -	printk(KERN_CRIT "Total free blocks count %lld\n", +	struct super_block *sb = inode->i_sb; + +	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",  	       EXT4_C2B(EXT4_SB(inode->i_sb),  			ext4_count_free_clusters(inode->i_sb))); -	
printk(KERN_CRIT "Free/Dirty block details\n"); -	printk(KERN_CRIT "free_blocks=%lld\n", +	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); +	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",  	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),  		percpu_counter_sum(&sbi->s_freeclusters_counter))); -	printk(KERN_CRIT "dirty_blocks=%lld\n", +	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",  	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),  		percpu_counter_sum(&sbi->s_dirtyclusters_counter))); -	printk(KERN_CRIT "Block reservation details\n"); -	printk(KERN_CRIT "i_reserved_data_blocks=%u\n", -	       EXT4_I(inode)->i_reserved_data_blocks); -	printk(KERN_CRIT "i_reserved_meta_blocks=%u\n", +	ext4_msg(sb, KERN_CRIT, "Block reservation details"); +	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", +		 EXT4_I(inode)->i_reserved_data_blocks); +	ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",  	       EXT4_I(inode)->i_reserved_meta_blocks);  	return;  } @@ -2482,13 +2484,14 @@ static int ext4_da_write_end(struct file *file,  	int write_mode = (int)(unsigned long)fsdata;  	if (write_mode == FALL_BACK_TO_NONDELALLOC) { -		if (ext4_should_order_data(inode)) { +		switch (ext4_inode_journal_mode(inode)) { +		case EXT4_INODE_ORDERED_DATA_MODE:  			return ext4_ordered_write_end(file, mapping, pos,  					len, copied, page, fsdata); -		} else if (ext4_should_writeback_data(inode)) { +		case EXT4_INODE_WRITEBACK_DATA_MODE:  			return ext4_writeback_write_end(file, mapping, pos,  					len, copied, page, fsdata); -		} else { +		default:  			BUG();  		}  	} @@ -2763,7 +2766,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,  		goto out;  	ext_debug("ext4_end_io_dio(): io_end 0x%p " -		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", +		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",   		  iocb->private, io_end->inode->i_ino, iocb, offset,  		  size); @@ -2795,9 +2798,6 @@ out:  	/* queue the work to convert unwritten extents to written */  	
queue_work(wq, &io_end->work); - -	/* XXX: probably should move into the real I/O completion handler */ -	inode_dio_done(inode);  }  static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) @@ -2811,8 +2811,9 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)  		goto out;  	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { -		printk("sb umounted, discard end_io request for inode %lu\n", -			io_end->inode->i_ino); +		ext4_msg(io_end->inode->i_sb, KERN_INFO, +			 "sb umounted, discard end_io request for inode %lu", +			 io_end->inode->i_ino);  		ext4_free_io_end(io_end);  		goto out;  	} @@ -2921,9 +2922,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  		iocb->private = NULL;  		EXT4_I(inode)->cur_aio_dio = NULL;  		if (!is_sync_kiocb(iocb)) { -			iocb->private = ext4_init_io_end(inode, GFP_NOFS); -			if (!iocb->private) +			ext4_io_end_t *io_end = +				ext4_init_io_end(inode, GFP_NOFS); +			if (!io_end)  				return -ENOMEM; +			io_end->flag |= EXT4_IO_END_DIRECT; +			iocb->private = io_end;  			/*  			 * we save the io structure for current async  			 * direct IO, so that later ext4_map_blocks() @@ -2940,7 +2944,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,  					 ext4_get_block_write,  					 ext4_end_io_dio,  					 NULL, -					 DIO_LOCKING | DIO_SKIP_HOLES); +					 DIO_LOCKING);  		if (iocb->private)  			EXT4_I(inode)->cur_aio_dio = NULL;  		/* @@ -3086,18 +3090,25 @@ static const struct address_space_operations ext4_da_aops = {  void ext4_set_aops(struct inode *inode)  { -	if (ext4_should_order_data(inode) && -		test_opt(inode->i_sb, DELALLOC)) -		inode->i_mapping->a_ops = &ext4_da_aops; -	else if (ext4_should_order_data(inode)) -		inode->i_mapping->a_ops = &ext4_ordered_aops; -	else if (ext4_should_writeback_data(inode) && -		 test_opt(inode->i_sb, DELALLOC)) -		inode->i_mapping->a_ops = &ext4_da_aops; -	else if (ext4_should_writeback_data(inode)) -		inode->i_mapping->a_ops = 
&ext4_writeback_aops; -	else +	switch (ext4_inode_journal_mode(inode)) { +	case EXT4_INODE_ORDERED_DATA_MODE: +		if (test_opt(inode->i_sb, DELALLOC)) +			inode->i_mapping->a_ops = &ext4_da_aops; +		else +			inode->i_mapping->a_ops = &ext4_ordered_aops; +		break; +	case EXT4_INODE_WRITEBACK_DATA_MODE: +		if (test_opt(inode->i_sb, DELALLOC)) +			inode->i_mapping->a_ops = &ext4_da_aops; +		else +			inode->i_mapping->a_ops = &ext4_writeback_aops; +		break; +	case EXT4_INODE_JOURNAL_DATA_MODE:  		inode->i_mapping->a_ops = &ext4_journalled_aops; +		break; +	default: +		BUG(); +	}  } @@ -3329,16 +3340,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)  {  	struct inode *inode = file->f_path.dentry->d_inode;  	if (!S_ISREG(inode->i_mode)) -		return -ENOTSUPP; +		return -EOPNOTSUPP;  	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {  		/* TODO: Add support for non extent hole punching */ -		return -ENOTSUPP; +		return -EOPNOTSUPP;  	}  	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {  		/* TODO: Add support for bigalloc file systems */ -		return -ENOTSUPP; +		return -EOPNOTSUPP;  	}  	return ext4_ext_punch_hole(file, offset, length); @@ -3924,10 +3935,8 @@ static int ext4_do_update_inode(handle_t *handle,  			ext4_update_dynamic_rev(sb);  			EXT4_SET_RO_COMPAT_FEATURE(sb,  					EXT4_FEATURE_RO_COMPAT_LARGE_FILE); -			sb->s_dirt = 1;  			ext4_handle_sync(handle); -			err = ext4_handle_dirty_metadata(handle, NULL, -					EXT4_SB(sb)->s_sbh); +			err = ext4_handle_dirty_super(handle, sb);  		}  	}  	raw_inode->i_generation = cpu_to_le32(inode->i_generation); @@ -4152,11 +4161,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)  	}  	if (attr->ia_valid & ATTR_SIZE) { -		if (attr->ia_size != i_size_read(inode)) { +		if (attr->ia_size != i_size_read(inode))  			truncate_setsize(inode, attr->ia_size); -			ext4_truncate(inode); -		} else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) -			ext4_truncate(inode); +		
ext4_truncate(inode);  	}  	if (!rc) { @@ -4314,7 +4321,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,  {  	int err = 0; -	if (test_opt(inode->i_sb, I_VERSION)) +	if (IS_I_VERSION(inode))  		inode_inc_iversion(inode);  	/* the do_update_inode consumes one bh->b_count */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cb990b21c69..99ab428bcfa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,6 +21,7 @@   * mballoc.c contains the multiblocks allocation routines   */ +#include "ext4_jbd2.h"  #include "mballoc.h"  #include <linux/debugfs.h>  #include <linux/slab.h> @@ -339,7 +340,7 @@   */  static struct kmem_cache *ext4_pspace_cachep;  static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; +static struct kmem_cache *ext4_free_data_cachep;  /* We create slab caches for groupinfo data structures based on the   * superblock block size.  There will be one per mounted filesystem for @@ -357,7 +358,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,  					ext4_group_t group);  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,  						ext4_group_t group); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); +static void ext4_free_data_callback(struct super_block *sb, +				struct ext4_journal_cb_entry *jce, int rc);  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)  { @@ -425,7 +427,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)  {  	char *bb; -	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); +	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);  	BUG_ON(max == NULL);  	if (order > e4b->bd_blkbits + 1) { @@ -436,10 +438,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)  	/* at order 0 we see each particular block */  	if (order == 0) {  		*max = 1 << (e4b->bd_blkbits + 3); -		return EXT4_MB_BITMAP(e4b); +		return e4b->bd_bitmap;  	} -	bb = EXT4_MB_BUDDY(e4b) + 
EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; +	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];  	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];  	return bb; @@ -588,7 +590,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,  			for (j = 0; j < (1 << order); j++) {  				k = (i * (1 << order)) + j;  				MB_CHECK_ASSERT( -					!mb_test_bit(k, EXT4_MB_BITMAP(e4b))); +					!mb_test_bit(k, e4b->bd_bitmap));  			}  			count++;  		} @@ -782,7 +784,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)  	int groups_per_page;  	int err = 0;  	int i; -	ext4_group_t first_group; +	ext4_group_t first_group, group;  	int first_block;  	struct super_block *sb;  	struct buffer_head *bhs; @@ -806,24 +808,23 @@ static int ext4_mb_init_cache(struct page *page, char *incore)  	/* allocate buffer_heads to read bitmaps */  	if (groups_per_page > 1) { -		err = -ENOMEM;  		i = sizeof(struct buffer_head *) * groups_per_page;  		bh = kzalloc(i, GFP_NOFS); -		if (bh == NULL) +		if (bh == NULL) { +			err = -ENOMEM;  			goto out; +		}  	} else  		bh = &bhs;  	first_group = page->index * blocks_per_page / 2;  	/* read all groups the page covers into the cache */ -	for (i = 0; i < groups_per_page; i++) { -		struct ext4_group_desc *desc; - -		if (first_group + i >= ngroups) +	for (i = 0, group = first_group; i < groups_per_page; i++, group++) { +		if (group >= ngroups)  			break; -		grinfo = ext4_get_group_info(sb, first_group + i); +		grinfo = ext4_get_group_info(sb, group);  		/*  		 * If page is uptodate then we came here after online resize  		 * which added some new uninitialized group info structs, so @@ -834,69 +835,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)  			bh[i] = NULL;  			continue;  		} - -		err = -EIO; -		desc = ext4_get_group_desc(sb, first_group + i, NULL); -		if (desc == NULL) -			goto out; - -		err = -ENOMEM; -		bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); -		if (bh[i] == NULL) +		if (!(bh[i] = 
ext4_read_block_bitmap_nowait(sb, group))) { +			err = -ENOMEM;  			goto out; - -		if (bitmap_uptodate(bh[i])) -			continue; - -		lock_buffer(bh[i]); -		if (bitmap_uptodate(bh[i])) { -			unlock_buffer(bh[i]); -			continue; -		} -		ext4_lock_group(sb, first_group + i); -		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { -			ext4_init_block_bitmap(sb, bh[i], -						first_group + i, desc); -			set_bitmap_uptodate(bh[i]); -			set_buffer_uptodate(bh[i]); -			ext4_unlock_group(sb, first_group + i); -			unlock_buffer(bh[i]); -			continue;  		} -		ext4_unlock_group(sb, first_group + i); -		if (buffer_uptodate(bh[i])) { -			/* -			 * if not uninit if bh is uptodate, -			 * bitmap is also uptodate -			 */ -			set_bitmap_uptodate(bh[i]); -			unlock_buffer(bh[i]); -			continue; -		} -		get_bh(bh[i]); -		/* -		 * submit the buffer_head for read. We can -		 * safely mark the bitmap as uptodate now. -		 * We do it here so the bitmap uptodate bit -		 * get set with buffer lock held. -		 */ -		set_bitmap_uptodate(bh[i]); -		bh[i]->b_end_io = end_buffer_read_sync; -		submit_bh(READ, bh[i]); -		mb_debug(1, "read bitmap for group %u\n", first_group + i); +		mb_debug(1, "read bitmap for group %u\n", group);  	}  	/* wait for I/O completion */ -	for (i = 0; i < groups_per_page; i++) -		if (bh[i]) -			wait_on_buffer(bh[i]); - -	err = -EIO; -	for (i = 0; i < groups_per_page; i++) -		if (bh[i] && !buffer_uptodate(bh[i])) +	for (i = 0, group = first_group; i < groups_per_page; i++, group++) { +		if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { +			err = -EIO;  			goto out; +		} +	} -	err = 0;  	first_block = page->index * blocks_per_page;  	for (i = 0; i < blocks_per_page; i++) {  		int group; @@ -1250,10 +1203,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)  	int order = 1;  	void *bb; -	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); +	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);  	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); -	bb = 
EXT4_MB_BUDDY(e4b); +	bb = e4b->bd_buddy;  	while (order <= e4b->bd_blkbits + 1) {  		block = block >> 1;  		if (!mb_test_bit(block, bb)) { @@ -1323,9 +1276,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,  	/* let's maintain fragments counter */  	if (first != 0) -		block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); +		block = !mb_test_bit(first - 1, e4b->bd_bitmap);  	if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) -		max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); +		max = !mb_test_bit(first + count, e4b->bd_bitmap);  	if (block && max)  		e4b->bd_info->bb_fragments--;  	else if (!block && !max) @@ -1336,7 +1289,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,  		block = first++;  		order = 0; -		if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { +		if (!mb_test_bit(block, e4b->bd_bitmap)) {  			ext4_fsblk_t blocknr;  			blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1347,7 +1300,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,  					      "freeing already freed block "  					      "(bit %u)", block);  		} -		mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); +		mb_clear_bit(block, e4b->bd_bitmap);  		e4b->bd_info->bb_counters[order]++;  		/* start of the buddy */ @@ -1429,7 +1382,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,  			break;  		next = (block + 1) * (1 << order); -		if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) +		if (mb_test_bit(next, e4b->bd_bitmap))  			break;  		order = mb_find_order_for_block(e4b, next); @@ -1466,9 +1419,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)  	/* let's maintain fragments counter */  	if (start != 0) -		mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); +		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);  	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) -		max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); +		max = !mb_test_bit(start + len, e4b->bd_bitmap); 
 	if (mlen && max)  		e4b->bd_info->bb_fragments++;  	else if (!mlen && !max) @@ -1511,7 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)  	}  	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); -	ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); +	ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);  	mb_check_buddy(e4b);  	return ret; @@ -1810,7 +1763,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,  					struct ext4_buddy *e4b)  {  	struct super_block *sb = ac->ac_sb; -	void *bitmap = EXT4_MB_BITMAP(e4b); +	void *bitmap = e4b->bd_bitmap;  	struct ext4_free_extent ex;  	int i;  	int free; @@ -1870,7 +1823,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,  {  	struct super_block *sb = ac->ac_sb;  	struct ext4_sb_info *sbi = EXT4_SB(sb); -	void *bitmap = EXT4_MB_BITMAP(e4b); +	void *bitmap = e4b->bd_bitmap;  	struct ext4_free_extent ex;  	ext4_fsblk_t first_group_block;  	ext4_fsblk_t a; @@ -2224,7 +2177,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,  			EXT4_DESC_PER_BLOCK_BITS(sb);  		meta_group_info = kmalloc(metalen, GFP_KERNEL);  		if (meta_group_info == NULL) { -			ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " +			ext4_msg(sb, KERN_ERR, "can't allocate mem "  				 "for a buddy group");  			goto exit_meta_group_info;  		} @@ -2238,7 +2191,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,  	meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);  	if (meta_group_info[i] == NULL) { -		ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); +		ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");  		goto exit_group_info;  	}  	memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2522,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)  		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,  				 &ext4_mb_seq_groups_fops, sb); -	if (sbi->s_journal) -		sbi->s_journal->j_commit_callback = 
release_blocks_on_commit; -  	return 0;  out_free_locality_groups: @@ -2637,58 +2587,55 @@ static inline int ext4_issue_discard(struct super_block *sb,   * This function is called by the jbd2 layer once the commit has finished,   * so we know we can free the blocks that were released with that commit.   */ -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +static void ext4_free_data_callback(struct super_block *sb, +				    struct ext4_journal_cb_entry *jce, +				    int rc)  { -	struct super_block *sb = journal->j_private; +	struct ext4_free_data *entry = (struct ext4_free_data *)jce;  	struct ext4_buddy e4b;  	struct ext4_group_info *db;  	int err, count = 0, count2 = 0; -	struct ext4_free_data *entry; -	struct list_head *l, *ltmp; -	list_for_each_safe(l, ltmp, &txn->t_private_list) { -		entry = list_entry(l, struct ext4_free_data, list); +	mb_debug(1, "gonna free %u blocks in group %u (0x%p):", +		 entry->efd_count, entry->efd_group, entry); -		mb_debug(1, "gonna free %u blocks in group %u (0x%p):", -			 entry->count, entry->group, entry); +	if (test_opt(sb, DISCARD)) +		ext4_issue_discard(sb, entry->efd_group, +				   entry->efd_start_cluster, entry->efd_count); -		if (test_opt(sb, DISCARD)) -			ext4_issue_discard(sb, entry->group, -					   entry->start_cluster, entry->count); +	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); +	/* we expect to find existing buddy because it's pinned */ +	BUG_ON(err != 0); -		err = ext4_mb_load_buddy(sb, entry->group, &e4b); -		/* we expect to find existing buddy because it's pinned */ -		BUG_ON(err != 0); -		db = e4b.bd_info; -		/* there are blocks to put in buddy to make them really free */ -		count += entry->count; -		count2++; -		ext4_lock_group(sb, entry->group); -		/* Take it out of per group rb tree */ -		rb_erase(&entry->node, &(db->bb_free_root)); -		mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count); +	db = e4b.bd_info; +	/* there are blocks to put in buddy to make them 
really free */ +	count += entry->efd_count; +	count2++; +	ext4_lock_group(sb, entry->efd_group); +	/* Take it out of per group rb tree */ +	rb_erase(&entry->efd_node, &(db->bb_free_root)); +	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); -		/* -		 * Clear the trimmed flag for the group so that the next -		 * ext4_trim_fs can trim it. -		 * If the volume is mounted with -o discard, online discard -		 * is supported and the free blocks will be trimmed online. -		 */ -		if (!test_opt(sb, DISCARD)) -			EXT4_MB_GRP_CLEAR_TRIMMED(db); +	/* +	 * Clear the trimmed flag for the group so that the next +	 * ext4_trim_fs can trim it. +	 * If the volume is mounted with -o discard, online discard +	 * is supported and the free blocks will be trimmed online. +	 */ +	if (!test_opt(sb, DISCARD)) +		EXT4_MB_GRP_CLEAR_TRIMMED(db); -		if (!db->bb_free_root.rb_node) { -			/* No more items in the per group rb tree -			 * balance refcounts from ext4_mb_free_metadata() -			 */ -			page_cache_release(e4b.bd_buddy_page); -			page_cache_release(e4b.bd_bitmap_page); -		} -		ext4_unlock_group(sb, entry->group); -		kmem_cache_free(ext4_free_ext_cachep, entry); -		ext4_mb_unload_buddy(&e4b); +	if (!db->bb_free_root.rb_node) { +		/* No more items in the per group rb tree +		 * balance refcounts from ext4_mb_free_metadata() +		 */ +		page_cache_release(e4b.bd_buddy_page); +		page_cache_release(e4b.bd_bitmap_page);  	} +	ext4_unlock_group(sb, entry->efd_group); +	kmem_cache_free(ext4_free_data_cachep, entry); +	ext4_mb_unload_buddy(&e4b);  	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);  } @@ -2741,9 +2688,9 @@ int __init ext4_init_mballoc(void)  		return -ENOMEM;  	} -	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, -					  SLAB_RECLAIM_ACCOUNT); -	if (ext4_free_ext_cachep == NULL) { +	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, +					   SLAB_RECLAIM_ACCOUNT); +	if (ext4_free_data_cachep == NULL) {  		kmem_cache_destroy(ext4_pspace_cachep);  		
kmem_cache_destroy(ext4_ac_cachep);  		return -ENOMEM; @@ -2761,7 +2708,7 @@ void ext4_exit_mballoc(void)  	rcu_barrier();  	kmem_cache_destroy(ext4_pspace_cachep);  	kmem_cache_destroy(ext4_ac_cachep); -	kmem_cache_destroy(ext4_free_ext_cachep); +	kmem_cache_destroy(ext4_free_data_cachep);  	ext4_groupinfo_destroy_slabs();  	ext4_remove_debugfs_entry();  } @@ -2815,7 +2762,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,  	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);  	if (!ext4_data_block_valid(sbi, block, len)) {  		ext4_error(sb, "Allocating blocks %llu-%llu which overlap " -			   "fs metadata\n", block, block+len); +			   "fs metadata", block, block+len);  		/* File system mounted not to panic on error  		 * Fix the bitmap and repeat the block allocation  		 * We leak some of the blocks here. @@ -2911,7 +2858,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,  	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);  	int bsbits, max;  	ext4_lblk_t end; -	loff_t size, orig_size, start_off; +	loff_t size, start_off; +	loff_t orig_size __maybe_unused;  	ext4_lblk_t start;  	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);  	struct ext4_prealloc_space *pa; @@ -3321,8 +3269,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,  	n = rb_first(&(grp->bb_free_root));  	while (n) { -		entry = rb_entry(n, struct ext4_free_data, node); -		ext4_set_bits(bitmap, entry->start_cluster, entry->count); +		entry = rb_entry(n, struct ext4_free_data, efd_node); +		ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);  		n = rb_next(n);  	}  	return; @@ -3916,11 +3864,11 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)  	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))  		return; -	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" +	ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"  			" Allocation context details:"); -	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", +	
ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",  			ac->ac_status, ac->ac_flags); -	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " +	ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "  		 	"goal %lu/%lu/%lu@%lu, "  			"best %lu/%lu/%lu@%lu cr %d",  			(unsigned long)ac->ac_o_ex.fe_group, @@ -3936,9 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)  			(unsigned long)ac->ac_b_ex.fe_len,  			(unsigned long)ac->ac_b_ex.fe_logical,  			(int)ac->ac_criteria); -	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", +	ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",  		 ac->ac_ex_scanned, ac->ac_found); -	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); +	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");  	ngroups = ext4_get_groups_count(sb);  	for (i = 0; i < ngroups; i++) {  		struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -4428,9 +4376,9 @@ out:  static int can_merge(struct ext4_free_data *entry1,  			struct ext4_free_data *entry2)  { -	if ((entry1->t_tid == entry2->t_tid) && -	    (entry1->group == entry2->group) && -	    ((entry1->start_cluster + entry1->count) == entry2->start_cluster)) +	if ((entry1->efd_tid == entry2->efd_tid) && +	    (entry1->efd_group == entry2->efd_group) && +	    ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))  		return 1;  	return 0;  } @@ -4452,8 +4400,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,  	BUG_ON(e4b->bd_bitmap_page == NULL);  	BUG_ON(e4b->bd_buddy_page == NULL); -	new_node = &new_entry->node; -	cluster = new_entry->start_cluster; +	new_node = &new_entry->efd_node; +	cluster = new_entry->efd_start_cluster;  	if (!*n) {  		/* first free block exent. 
We need to @@ -4466,10 +4414,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,  	}  	while (*n) {  		parent = *n; -		entry = rb_entry(parent, struct ext4_free_data, node); -		if (cluster < entry->start_cluster) +		entry = rb_entry(parent, struct ext4_free_data, efd_node); +		if (cluster < entry->efd_start_cluster)  			n = &(*n)->rb_left; -		else if (cluster >= (entry->start_cluster + entry->count)) +		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))  			n = &(*n)->rb_right;  		else {  			ext4_grp_locked_error(sb, group, 0, @@ -4486,34 +4434,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,  	/* Now try to see the extent can be merged to left and right */  	node = rb_prev(new_node);  	if (node) { -		entry = rb_entry(node, struct ext4_free_data, node); +		entry = rb_entry(node, struct ext4_free_data, efd_node);  		if (can_merge(entry, new_entry)) { -			new_entry->start_cluster = entry->start_cluster; -			new_entry->count += entry->count; +			new_entry->efd_start_cluster = entry->efd_start_cluster; +			new_entry->efd_count += entry->efd_count;  			rb_erase(node, &(db->bb_free_root)); -			spin_lock(&sbi->s_md_lock); -			list_del(&entry->list); -			spin_unlock(&sbi->s_md_lock); -			kmem_cache_free(ext4_free_ext_cachep, entry); +			ext4_journal_callback_del(handle, &entry->efd_jce); +			kmem_cache_free(ext4_free_data_cachep, entry);  		}  	}  	node = rb_next(new_node);  	if (node) { -		entry = rb_entry(node, struct ext4_free_data, node); +		entry = rb_entry(node, struct ext4_free_data, efd_node);  		if (can_merge(new_entry, entry)) { -			new_entry->count += entry->count; +			new_entry->efd_count += entry->efd_count;  			rb_erase(node, &(db->bb_free_root)); -			spin_lock(&sbi->s_md_lock); -			list_del(&entry->list); -			spin_unlock(&sbi->s_md_lock); -			kmem_cache_free(ext4_free_ext_cachep, entry); +			ext4_journal_callback_del(handle, &entry->efd_jce); +			kmem_cache_free(ext4_free_data_cachep, entry);  		}  	}  	
/* Add the extent to transaction's private list */ -	spin_lock(&sbi->s_md_lock); -	list_add(&new_entry->list, &handle->h_transaction->t_private_list); -	spin_unlock(&sbi->s_md_lock); +	ext4_journal_callback_add(handle, ext4_free_data_callback, +				  &new_entry->efd_jce);  	return 0;  } @@ -4691,15 +4634,15 @@ do_more:  		 * blocks being freed are metadata. these blocks shouldn't  		 * be used until this transaction is committed  		 */ -		new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); +		new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);  		if (!new_entry) {  			err = -ENOMEM;  			goto error_return;  		} -		new_entry->start_cluster = bit; -		new_entry->group  = block_group; -		new_entry->count = count_clusters; -		new_entry->t_tid = handle->h_transaction->t_tid; +		new_entry->efd_start_cluster = bit; +		new_entry->efd_group = block_group; +		new_entry->efd_count = count_clusters; +		new_entry->efd_tid = handle->h_transaction->t_tid;  		ext4_lock_group(sb, block_group);  		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); @@ -4971,11 +4914,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,  	start = (e4b.bd_info->bb_first_free > start) ?  		
e4b.bd_info->bb_first_free : start; -	while (start < max) { -		start = mb_find_next_zero_bit(bitmap, max, start); -		if (start >= max) +	while (start <= max) { +		start = mb_find_next_zero_bit(bitmap, max + 1, start); +		if (start > max)  			break; -		next = mb_find_next_bit(bitmap, max, start); +		next = mb_find_next_bit(bitmap, max + 1, start);  		if ((next - start) >= minblocks) {  			ext4_trim_extent(sb, start, @@ -5027,37 +4970,36 @@ out:  int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)  {  	struct ext4_group_info *grp; -	ext4_group_t first_group, last_group; -	ext4_group_t group, ngroups = ext4_get_groups_count(sb); +	ext4_group_t group, first_group, last_group;  	ext4_grpblk_t cnt = 0, first_cluster, last_cluster; -	uint64_t start, len, minlen, trimmed = 0; +	uint64_t start, end, minlen, trimmed = 0;  	ext4_fsblk_t first_data_blk =  			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);  	int ret = 0;  	start = range->start >> sb->s_blocksize_bits; -	len = range->len >> sb->s_blocksize_bits; +	end = start + (range->len >> sb->s_blocksize_bits) - 1;  	minlen = range->minlen >> sb->s_blocksize_bits; -	if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb))) +	if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || +	    unlikely(start >= max_blks))  		return -EINVAL; -	if (start + len <= first_data_blk) +	if (end >= max_blks) +		end = max_blks - 1; +	if (end <= first_data_blk)  		goto out; -	if (start < first_data_blk) { -		len -= first_data_blk - start; +	if (start < first_data_blk)  		start = first_data_blk; -	} -	/* Determine first and last group to examine based on start and len */ +	/* Determine first and last group to examine based on start and end */  	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,  				     &first_group, &first_cluster); -	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), +	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,  				     
&last_group, &last_cluster); -	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; -	last_cluster = EXT4_CLUSTERS_PER_GROUP(sb); -	if (first_group > last_group) -		return -EINVAL; +	/* end now represents the last cluster to discard in this group */ +	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;  	for (group = first_group; group <= last_group; group++) {  		grp = ext4_get_group_info(sb, group); @@ -5069,31 +5011,35 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)  		}  		/* -		 * For all the groups except the last one, last block will -		 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to -		 * change it for the last group in which case start + -		 * len < EXT4_BLOCKS_PER_GROUP(sb). +		 * For all the groups except the last one, last cluster will +		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to +		 * change it for the last group, note that last_cluster is +		 * already computed earlier by ext4_get_group_no_and_offset()  		 */ -		if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb)) -			last_cluster = first_cluster + len; -		len -= last_cluster - first_cluster; +		if (group == last_group) +			end = last_cluster;  		if (grp->bb_free >= minlen) {  			cnt = ext4_trim_all_free(sb, group, first_cluster, -						last_cluster, minlen); +						end, minlen);  			if (cnt < 0) {  				ret = cnt;  				break;  			} +			trimmed += cnt;  		} -		trimmed += cnt; + +		/* +		 * For every group except the first one, we are sure +		 * that the first cluster to discard will be cluster #0. 
+		 */  		first_cluster = 0;  	} -	range->len = trimmed * sb->s_blocksize;  	if (!ret)  		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);  out: +	range->len = trimmed * sb->s_blocksize;  	return ret;  } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 47705f3285e..c070618c21c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -96,21 +96,23 @@ extern u8 mb_enable_debug;  struct ext4_free_data { -	/* this links the free block information from group_info */ -	struct rb_node node; +	/* MUST be the first member */ +	struct ext4_journal_cb_entry	efd_jce; + +	/* ext4_free_data private data starts from here */ -	/* this links the free block information from ext4_sb_info */ -	struct list_head list; +	/* this links the free block information from group_info */ +	struct rb_node			efd_node;  	/* group which free block extent belongs */ -	ext4_group_t group; +	ext4_group_t			efd_group;  	/* free block extent */ -	ext4_grpblk_t start_cluster; -	ext4_grpblk_t count; +	ext4_grpblk_t			efd_start_cluster; +	ext4_grpblk_t			efd_count;  	/* transaction which freed this extent */ -	tid_t	t_tid; +	tid_t				efd_tid;  };  struct ext4_prealloc_space { @@ -210,8 +212,6 @@ struct ext4_buddy {  	__u16 bd_blkbits;  	ext4_group_t bd_group;  }; -#define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)  static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,  					struct ext4_free_extent *fex) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index e7d6bb0acfa..f39f80f8f2c 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -471,7 +471,7 @@ int ext4_ext_migrate(struct inode *inode)  	tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,  				   S_IFREG, NULL, goal, owner);  	if (IS_ERR(tmp_inode)) { -		retval = PTR_ERR(inode); +		retval = PTR_ERR(tmp_inode);  		ext4_journal_stop(handle);  		return retval;  	} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 7ea4ba4eff2..ed6548d8916 100644 --- 
a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -257,8 +257,8 @@ int ext4_multi_mount_protect(struct super_block *sb,  	 * If check_interval in MMP block is larger, use that instead of  	 * update_interval from the superblock.  	 */ -	if (mmp->mmp_check_interval > mmp_check_interval) -		mmp_check_interval = mmp->mmp_check_interval; +	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) +		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);  	seq = le32_to_cpu(mmp->mmp_seq);  	if (seq == EXT4_MMP_SEQ_CLEAN) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2043f482375..349d7b3671c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -468,7 +468,7 @@ fail2:  fail:  	if (*err == ERR_BAD_DX_DIR)  		ext4_warning(dir->i_sb, -			     "Corrupt dir inode %ld, running e2fsck is " +			     "Corrupt dir inode %lu, running e2fsck is "  			     "recommended.", dir->i_ino);  	return NULL;  } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 47585189651..74cd1f7f1f8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -60,7 +60,6 @@ void ext4_ioend_wait(struct inode *inode)  static void put_io_page(struct ext4_io_page *io_page)  {  	if (atomic_dec_and_test(&io_page->p_count)) { -		end_page_writeback(io_page->p_page);  		put_page(io_page->p_page);  		kmem_cache_free(io_page_cachep, io_page);  	} @@ -110,6 +109,8 @@ int ext4_end_io_nolock(ext4_io_end_t *io)  	if (io->iocb)  		aio_complete(io->iocb, io->result, 0); +	if (io->flag & EXT4_IO_END_DIRECT) +		inode_dio_done(inode);  	/* Wake up anyone waiting on unwritten extent conversion */  	if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))  		wake_up_all(ext4_ioend_wq(io->inode)); @@ -127,12 +128,18 @@ static void ext4_end_io_work(struct work_struct *work)  	unsigned long		flags;  	spin_lock_irqsave(&ei->i_completed_io_lock, flags); +	if (io->flag & EXT4_IO_END_IN_FSYNC) +		goto requeue;  	if (list_empty(&io->list)) {  		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  		goto free;  	}  	if 
(!mutex_trylock(&inode->i_mutex)) { +		bool was_queued; +requeue: +		was_queued = !!(io->flag & EXT4_IO_END_QUEUED); +		io->flag |= EXT4_IO_END_QUEUED;  		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);  		/*  		 * Requeue the work instead of waiting so that the work @@ -145,9 +152,8 @@ static void ext4_end_io_work(struct work_struct *work)  		 * yield the cpu if it sees an end_io request that has already  		 * been requeued.  		 */ -		if (io->flag & EXT4_IO_END_QUEUED) +		if (was_queued)  			yield(); -		io->flag |= EXT4_IO_END_QUEUED;  		return;  	}  	list_del_init(&io->list); @@ -227,9 +233,9 @@ static void ext4_end_bio(struct bio *bio, int error)  			} while (bh != head);  		} -		put_io_page(io_end->pages[i]); +		if (atomic_read(&io_end->pages[i]->p_count) == 1) +			end_page_writeback(io_end->pages[i]->p_page);  	} -	io_end->num_io_pages = 0;  	inode = io_end->inode;  	if (error) { @@ -421,6 +427,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,  	 * PageWriteback bit from the page to prevent the system from  	 * wedging later on.  	 
*/ +	if (atomic_read(&io_page->p_count) == 1) +		end_page_writeback(page);  	put_io_page(io_page);  	return ret;  } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f9d948f0eb8..59fa0be2725 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1163,8 +1163,11 @@ static void ext4_update_super(struct super_block *sb,  	do_div(reserved_blocks, 100);  	ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); +	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);  	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *  		     flex_gd->count); +	le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * +		     flex_gd->count);  	/*  	 * We need to protect s_groups_count against other CPUs seeing @@ -1465,6 +1468,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,  	}  	ext4_blocks_count_set(es, o_blocks_count + add); +	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);  	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,  		   o_blocks_count + add);  	/* We add the blocks to the bitmap and set the group need init bit */ @@ -1512,16 +1516,17 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,  	o_blocks_count = ext4_blocks_count(es);  	if (test_opt(sb, DEBUG)) -		printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", -		       o_blocks_count, n_blocks_count); +		ext4_msg(sb, KERN_DEBUG, +			 "extending last group from %llu to %llu blocks", +			 o_blocks_count, n_blocks_count);  	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)  		return 0;  	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { -		printk(KERN_ERR "EXT4-fs: filesystem on %s:" -			" too large to resize to %llu blocks safely\n", -			sb->s_id, n_blocks_count); +		ext4_msg(sb, KERN_ERR, +			 "filesystem too large to resize to %llu blocks safely", +			 n_blocks_count);  		if (sizeof(sector_t) < 8)  			ext4_warning(sb, "CONFIG_LBDAF not 
enabled");  		return -EINVAL; @@ -1582,7 +1587,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)  	ext4_fsblk_t o_blocks_count;  	ext4_group_t o_group;  	ext4_group_t n_group; -	ext4_grpblk_t offset; +	ext4_grpblk_t offset, add;  	unsigned long n_desc_blocks;  	unsigned long o_desc_blocks;  	unsigned long desc_blocks; @@ -1591,8 +1596,8 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)  	o_blocks_count = ext4_blocks_count(es);  	if (test_opt(sb, DEBUG)) -		printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu " -		       "upto %llu blocks\n", o_blocks_count, n_blocks_count); +		ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " +		       "to %llu blocks", o_blocks_count, n_blocks_count);  	if (n_blocks_count < o_blocks_count) {  		/* On-line shrinking not supported */ @@ -1605,7 +1610,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)  		return 0;  	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); -	ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset); +	ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);  	n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /  			EXT4_DESC_PER_BLOCK(sb); @@ -1634,10 +1639,12 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)  	}  	brelse(bh); -	if (offset != 0) { -		/* extend the last group */ -		ext4_grpblk_t add; -		add = EXT4_BLOCKS_PER_GROUP(sb) - offset; +	/* extend the last group */ +	if (n_group == o_group) +		add = n_blocks_count - o_blocks_count; +	else +		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); +	if (add > 0) {  		err = ext4_group_extend_no_check(sb, o_blocks_count, add);  		if (err)  			goto out; @@ -1674,7 +1681,7 @@ out:  	iput(resize_inode);  	if (test_opt(sb, DEBUG)) -		printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu " -		       "upto %llu blocks\n", o_blocks_count, n_blocks_count); +		ext4_msg(sb, KERN_DEBUG, "resized filesystem 
from %llu " +		       "upto %llu blocks", o_blocks_count, n_blocks_count);  	return err;  } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 502c61fd739..ceebaf853be 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -62,6 +62,7 @@ static struct ext4_features *ext4_feat;  static int ext4_load_journal(struct super_block *, struct ext4_super_block *,  			     unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root);  static int ext4_commit_super(struct super_block *sb, int sync);  static void ext4_mark_recovery_complete(struct super_block *sb,  					struct ext4_super_block *es); @@ -375,7 +376,7 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,  	if (is_handle_aborted(handle))  		return; -	printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", +	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",  	       caller, line, errstr, err_fn);  	jbd2_journal_abort_handle(handle); @@ -431,6 +432,22 @@ static int block_device_ejected(struct super_block *sb)  	return bdi->dev == NULL;  } +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ +	struct super_block		*sb = journal->j_private; +	struct ext4_sb_info		*sbi = EXT4_SB(sb); +	int				error = is_journal_aborted(journal); +	struct ext4_journal_cb_entry	*jce, *tmp; + +	spin_lock(&sbi->s_md_lock); +	list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { +		list_del_init(&jce->jce_list); +		spin_unlock(&sbi->s_md_lock); +		jce->jce_func(sb, jce, error); +		spin_lock(&sbi->s_md_lock); +	} +	spin_unlock(&sbi->s_md_lock); +}  /* Deal with the reporting of failure conditions on a filesystem such as   * inconsistencies detected or read IO failures. 
@@ -498,11 +515,16 @@ void ext4_error_inode(struct inode *inode, const char *function,  	va_start(args, fmt);  	vaf.fmt = fmt;  	vaf.va = &args; -	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", -	       inode->i_sb->s_id, function, line, inode->i_ino);  	if (block) -		printk(KERN_CONT "block %llu: ", block); -	printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf); +		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " +		       "inode #%lu: block %llu: comm %s: %pV\n", +		       inode->i_sb->s_id, function, line, inode->i_ino, +		       block, current->comm, &vaf); +	else +		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " +		       "inode #%lu: comm %s: %pV\n", +		       inode->i_sb->s_id, function, line, inode->i_ino, +		       current->comm, &vaf);  	va_end(args);  	ext4_handle_error(inode->i_sb); @@ -524,15 +546,21 @@ void ext4_error_file(struct file *file, const char *function,  	path = d_path(&(file->f_path), pathname, sizeof(pathname));  	if (IS_ERR(path))  		path = "(unknown)"; -	printk(KERN_CRIT -	       "EXT4-fs error (device %s): %s:%d: inode #%lu: ", -	       inode->i_sb->s_id, function, line, inode->i_ino); -	if (block) -		printk(KERN_CONT "block %llu: ", block);  	va_start(args, fmt);  	vaf.fmt = fmt;  	vaf.va = &args; -	printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf); +	if (block) +		printk(KERN_CRIT +		       "EXT4-fs error (device %s): %s:%d: inode #%lu: " +		       "block %llu: comm %s: path %s: %pV\n", +		       inode->i_sb->s_id, function, line, inode->i_ino, +		       block, current->comm, path, &vaf); +	else +		printk(KERN_CRIT +		       "EXT4-fs error (device %s): %s:%d: inode #%lu: " +		       "comm %s: path %s: %pV\n", +		       inode->i_sb->s_id, function, line, inode->i_ino, +		       current->comm, path, &vaf);  	va_end(args);  	ext4_handle_error(inode->i_sb); @@ -808,9 +836,6 @@ static void ext4_put_super(struct super_block *sb)  	destroy_workqueue(sbi->dio_unwritten_wq);  	
lock_super(sb); -	if (sb->s_dirt) -		ext4_commit_super(sb, 1); -  	if (sbi->s_journal) {  		err = jbd2_journal_destroy(sbi->s_journal);  		sbi->s_journal = NULL; @@ -827,9 +852,12 @@ static void ext4_put_super(struct super_block *sb)  	if (!(sb->s_flags & MS_RDONLY)) {  		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);  		es->s_state = cpu_to_le16(sbi->s_mount_state); -		ext4_commit_super(sb, 1);  	} +	if (sb->s_dirt || !(sb->s_flags & MS_RDONLY)) +		ext4_commit_super(sb, 1); +  	if (sbi->s_proc) { +		remove_proc_entry("options", sbi->s_proc);  		remove_proc_entry(sb->s_id, ext4_proc_root);  	}  	kobject_del(&sbi->s_kobj); @@ -990,180 +1018,6 @@ void ext4_clear_inode(struct inode *inode)  	}  } -static inline void ext4_show_quota_options(struct seq_file *seq, -					   struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) -	struct ext4_sb_info *sbi = EXT4_SB(sb); - -	if (sbi->s_jquota_fmt) { -		char *fmtname = ""; - -		switch (sbi->s_jquota_fmt) { -		case QFMT_VFS_OLD: -			fmtname = "vfsold"; -			break; -		case QFMT_VFS_V0: -			fmtname = "vfsv0"; -			break; -		case QFMT_VFS_V1: -			fmtname = "vfsv1"; -			break; -		} -		seq_printf(seq, ",jqfmt=%s", fmtname); -	} - -	if (sbi->s_qf_names[USRQUOTA]) -		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - -	if (sbi->s_qf_names[GRPQUOTA]) -		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - -	if (test_opt(sb, USRQUOTA)) -		seq_puts(seq, ",usrquota"); - -	if (test_opt(sb, GRPQUOTA)) -		seq_puts(seq, ",grpquota"); -#endif -} - -/* - * Show an option if - *  - it's set to a non-default value OR - *  - if the per-sb default is different from the global default - */ -static int ext4_show_options(struct seq_file *seq, struct dentry *root) -{ -	int def_errors; -	unsigned long def_mount_opts; -	struct super_block *sb = root->d_sb; -	struct ext4_sb_info *sbi = EXT4_SB(sb); -	struct ext4_super_block *es = sbi->s_es; - -	def_mount_opts = le32_to_cpu(es->s_default_mount_opts); -	def_errors     = 
le16_to_cpu(es->s_errors); - -	if (sbi->s_sb_block != 1) -		seq_printf(seq, ",sb=%llu", sbi->s_sb_block); -	if (test_opt(sb, MINIX_DF)) -		seq_puts(seq, ",minixdf"); -	if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) -		seq_puts(seq, ",grpid"); -	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) -		seq_puts(seq, ",nogrpid"); -	if (sbi->s_resuid != EXT4_DEF_RESUID || -	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { -		seq_printf(seq, ",resuid=%u", sbi->s_resuid); -	} -	if (sbi->s_resgid != EXT4_DEF_RESGID || -	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { -		seq_printf(seq, ",resgid=%u", sbi->s_resgid); -	} -	if (test_opt(sb, ERRORS_RO)) { -		if (def_errors == EXT4_ERRORS_PANIC || -		    def_errors == EXT4_ERRORS_CONTINUE) { -			seq_puts(seq, ",errors=remount-ro"); -		} -	} -	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) -		seq_puts(seq, ",errors=continue"); -	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) -		seq_puts(seq, ",errors=panic"); -	if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) -		seq_puts(seq, ",nouid32"); -	if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) -		seq_puts(seq, ",debug"); -#ifdef CONFIG_EXT4_FS_XATTR -	if (test_opt(sb, XATTR_USER)) -		seq_puts(seq, ",user_xattr"); -	if (!test_opt(sb, XATTR_USER)) -		seq_puts(seq, ",nouser_xattr"); -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL -	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) -		seq_puts(seq, ",acl"); -	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) -		seq_puts(seq, ",noacl"); -#endif -	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { -		seq_printf(seq, ",commit=%u", -			   (unsigned) (sbi->s_commit_interval / HZ)); -	} -	if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { -		seq_printf(seq, ",min_batch_time=%u", -			   (unsigned) sbi->s_min_batch_time); -	} -	if (sbi->s_max_batch_time != 
EXT4_DEF_MAX_BATCH_TIME) { -		seq_printf(seq, ",max_batch_time=%u", -			   (unsigned) sbi->s_max_batch_time); -	} - -	/* -	 * We're changing the default of barrier mount option, so -	 * let's always display its mount state so it's clear what its -	 * status is. -	 */ -	seq_puts(seq, ",barrier="); -	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); -	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) -		seq_puts(seq, ",journal_async_commit"); -	else if (test_opt(sb, JOURNAL_CHECKSUM)) -		seq_puts(seq, ",journal_checksum"); -	if (test_opt(sb, I_VERSION)) -		seq_puts(seq, ",i_version"); -	if (!test_opt(sb, DELALLOC) && -	    !(def_mount_opts & EXT4_DEFM_NODELALLOC)) -		seq_puts(seq, ",nodelalloc"); - -	if (!test_opt(sb, MBLK_IO_SUBMIT)) -		seq_puts(seq, ",nomblk_io_submit"); -	if (sbi->s_stripe) -		seq_printf(seq, ",stripe=%lu", sbi->s_stripe); -	/* -	 * journal mode get enabled in different ways -	 * So just print the value even if we didn't specify it -	 */ -	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) -		seq_puts(seq, ",data=journal"); -	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) -		seq_puts(seq, ",data=ordered"); -	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) -		seq_puts(seq, ",data=writeback"); - -	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) -		seq_printf(seq, ",inode_readahead_blks=%u", -			   sbi->s_inode_readahead_blks); - -	if (test_opt(sb, DATA_ERR_ABORT)) -		seq_puts(seq, ",data_err=abort"); - -	if (test_opt(sb, NO_AUTO_DA_ALLOC)) -		seq_puts(seq, ",noauto_da_alloc"); - -	if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) -		seq_puts(seq, ",discard"); - -	if (test_opt(sb, NOLOAD)) -		seq_puts(seq, ",norecovery"); - -	if (test_opt(sb, DIOREAD_NOLOCK)) -		seq_puts(seq, ",dioread_nolock"); - -	if (test_opt(sb, BLOCK_VALIDITY) && -	    !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) -		seq_puts(seq, ",block_validity"); - -	if (!test_opt(sb, INIT_INODE_TABLE)) -		seq_puts(seq, 
",noinit_itable"); -	else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) -		seq_printf(seq, ",init_itable=%u", -			   (unsigned) sbi->s_li_wait_mult); - -	ext4_show_quota_options(seq, sb); - -	return 0; -} -  static struct inode *ext4_nfs_get_inode(struct super_block *sb,  					u64 ino, u32 generation)  { @@ -1316,18 +1170,17 @@ static const struct export_operations ext4_export_ops = {  enum {  	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,  	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, -	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, +	Opt_nouid32, Opt_debug, Opt_removed,  	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, -	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh, +	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,  	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, -	Opt_journal_update, Opt_journal_dev, -	Opt_journal_checksum, Opt_journal_async_commit, +	Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,  	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,  	Opt_data_err_abort, Opt_data_err_ignore,  	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,  	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, -	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, -	Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, +	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, +	Opt_usrquota, Opt_grpquota, Opt_i_version,  	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,  	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,  	Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1350,20 +1203,19 @@ static const match_table_t tokens = {  	{Opt_err_ro, "errors=remount-ro"},  	{Opt_nouid32, "nouid32"},  	{Opt_debug, "debug"}, -	{Opt_oldalloc, "oldalloc"}, -	{Opt_orlov, "orlov"}, +	{Opt_removed, "oldalloc"}, +	{Opt_removed, "orlov"},  	{Opt_user_xattr, "user_xattr"},  	{Opt_nouser_xattr, "nouser_xattr"},  	{Opt_acl, 
"acl"},  	{Opt_noacl, "noacl"}, -	{Opt_noload, "noload"},  	{Opt_noload, "norecovery"}, -	{Opt_nobh, "nobh"}, -	{Opt_bh, "bh"}, +	{Opt_noload, "noload"}, +	{Opt_removed, "nobh"}, +	{Opt_removed, "bh"},  	{Opt_commit, "commit=%u"},  	{Opt_min_batch_time, "min_batch_time=%u"},  	{Opt_max_batch_time, "max_batch_time=%u"}, -	{Opt_journal_update, "journal=update"},  	{Opt_journal_dev, "journal_dev=%u"},  	{Opt_journal_checksum, "journal_checksum"},  	{Opt_journal_async_commit, "journal_async_commit"}, @@ -1389,7 +1241,6 @@ static const match_table_t tokens = {  	{Opt_nobarrier, "nobarrier"},  	{Opt_i_version, "i_version"},  	{Opt_stripe, "stripe=%u"}, -	{Opt_resize, "resize"},  	{Opt_delalloc, "delalloc"},  	{Opt_nodelalloc, "nodelalloc"},  	{Opt_mblk_io_submit, "mblk_io_submit"}, @@ -1408,6 +1259,11 @@ static const match_table_t tokens = {  	{Opt_init_itable, "init_itable=%u"},  	{Opt_init_itable, "init_itable"},  	{Opt_noinit_itable, "noinit_itable"}, +	{Opt_removed, "check=none"},	/* mount option from ext2/3 */ +	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */ +	{Opt_removed, "reservation"},	/* mount option from ext2/3 */ +	{Opt_removed, "noreservation"}, /* mount option from ext2/3 */ +	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */  	{Opt_err, NULL},  }; @@ -1496,420 +1352,273 @@ static int clear_qf_name(struct super_block *sb, int qtype)  }  #endif -static int parse_options(char *options, struct super_block *sb, -			 unsigned long *journal_devnum, -			 unsigned int *journal_ioprio, -			 ext4_fsblk_t *n_blocks_count, int is_remount) -{ -	struct ext4_sb_info *sbi = EXT4_SB(sb); -	char *p; -	substring_t args[MAX_OPT_ARGS]; -	int data_opt = 0; -	int option; +#define MOPT_SET	0x0001 +#define MOPT_CLEAR	0x0002 +#define MOPT_NOSUPPORT	0x0004 +#define MOPT_EXPLICIT	0x0008 +#define MOPT_CLEAR_ERR	0x0010 +#define MOPT_GTE0	0x0020  #ifdef CONFIG_QUOTA -	int qfmt; +#define MOPT_Q		0 +#define MOPT_QFMT	0x0040 +#else +#define MOPT_Q		MOPT_NOSUPPORT 
+#define MOPT_QFMT	MOPT_NOSUPPORT  #endif +#define MOPT_DATAJ	0x0080 -	if (!options) -		return 1; - -	while ((p = strsep(&options, ",")) != NULL) { -		int token; -		if (!*p) -			continue; - -		/* -		 * Initialize args struct so we know whether arg was -		 * found; some options take optional arguments. -		 */ -		args[0].to = args[0].from = NULL; -		token = match_token(p, tokens, args); -		switch (token) { -		case Opt_bsd_df: -			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); -			clear_opt(sb, MINIX_DF); -			break; -		case Opt_minix_df: -			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); -			set_opt(sb, MINIX_DF); - -			break; -		case Opt_grpid: -			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); -			set_opt(sb, GRPID); - -			break; -		case Opt_nogrpid: -			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); -			clear_opt(sb, GRPID); - -			break; -		case Opt_resuid: -			if (match_int(&args[0], &option)) -				return 0; -			sbi->s_resuid = option; -			break; -		case Opt_resgid: -			if (match_int(&args[0], &option)) -				return 0; -			sbi->s_resgid = option; -			break; -		case Opt_sb: -			/* handled by get_sb_block() instead of here */ -			/* *sb_block = match_int(&args[0]); */ -			break; -		case Opt_err_panic: -			clear_opt(sb, ERRORS_CONT); -			clear_opt(sb, ERRORS_RO); -			set_opt(sb, ERRORS_PANIC); -			break; -		case Opt_err_ro: -			clear_opt(sb, ERRORS_CONT); -			clear_opt(sb, ERRORS_PANIC); -			set_opt(sb, ERRORS_RO); -			break; -		case Opt_err_cont: -			clear_opt(sb, ERRORS_RO); -			clear_opt(sb, ERRORS_PANIC); -			set_opt(sb, ERRORS_CONT); -			break; -		case Opt_nouid32: -			set_opt(sb, NO_UID32); -			break; -		case Opt_debug: -			set_opt(sb, DEBUG); -			break; -		case Opt_oldalloc: -			ext4_msg(sb, KERN_WARNING, -				 "Ignoring deprecated oldalloc option"); -			break; -		case Opt_orlov: -			ext4_msg(sb, KERN_WARNING, -				 "Ignoring deprecated orlov option"); -			break; +static const struct mount_opts { +	int	token; +	int	
mount_opt; +	int	flags; +} ext4_mount_opts[] = { +	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, +	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, +	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, +	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, +	{Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, +	{Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, +	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, +	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, +	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, +	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, +	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, +	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, +	{Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, +	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, +	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, +	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | +				    EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, +	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, +	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, +	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, +	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, +	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, +	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, +	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, +	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, +	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, +	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, +	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, +	{Opt_commit, 0, MOPT_GTE0}, +	{Opt_max_batch_time, 0, MOPT_GTE0}, +	{Opt_min_batch_time, 0, MOPT_GTE0}, +	{Opt_inode_readahead_blks, 0, MOPT_GTE0}, +	{Opt_init_itable, 0, MOPT_GTE0}, +	{Opt_stripe, 0, MOPT_GTE0}, +	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, +	{Opt_data_ordered, 
EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, +	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},  #ifdef CONFIG_EXT4_FS_XATTR -		case Opt_user_xattr: -			set_opt(sb, XATTR_USER); -			break; -		case Opt_nouser_xattr: -			clear_opt(sb, XATTR_USER); -			break; +	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, +	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},  #else -		case Opt_user_xattr: -		case Opt_nouser_xattr: -			ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); -			break; +	{Opt_user_xattr, 0, MOPT_NOSUPPORT}, +	{Opt_nouser_xattr, 0, MOPT_NOSUPPORT},  #endif  #ifdef CONFIG_EXT4_FS_POSIX_ACL -		case Opt_acl: -			set_opt(sb, POSIX_ACL); -			break; -		case Opt_noacl: -			clear_opt(sb, POSIX_ACL); -			break; +	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, +	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},  #else -		case Opt_acl: -		case Opt_noacl: -			ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); -			break; +	{Opt_acl, 0, MOPT_NOSUPPORT}, +	{Opt_noacl, 0, MOPT_NOSUPPORT},  #endif -		case Opt_journal_update: -			/* @@@ FIXME */ -			/* Eventually we will want to be able to create -			   a journal file here.  For now, only allow the -			   user to specify an existing inode to be the -			   journal file. 
*/ -			if (is_remount) { -				ext4_msg(sb, KERN_ERR, -					 "Cannot specify journal on remount"); -				return 0; -			} -			set_opt(sb, UPDATE_JOURNAL); -			break; -		case Opt_journal_dev: -			if (is_remount) { +	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, +	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, +	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, +	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, +							MOPT_SET | MOPT_Q}, +	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, +							MOPT_SET | MOPT_Q}, +	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | +		       EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, +	{Opt_usrjquota, 0, MOPT_Q}, +	{Opt_grpjquota, 0, MOPT_Q}, +	{Opt_offusrjquota, 0, MOPT_Q}, +	{Opt_offgrpjquota, 0, MOPT_Q}, +	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, +	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, +	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, +	{Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, +			    substring_t *args, unsigned long *journal_devnum, +			    unsigned int *journal_ioprio, int is_remount) +{ +	struct ext4_sb_info *sbi = EXT4_SB(sb); +	const struct mount_opts *m; +	int arg = 0; + +	if (args->from && match_int(args, &arg)) +		return -1; +	switch (token) { +	case Opt_noacl: +	case Opt_nouser_xattr: +		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); +		break; +	case Opt_sb: +		return 1;	/* handled by get_sb_block() */ +	case Opt_removed: +		ext4_msg(sb, KERN_WARNING, +			 "Ignoring removed %s option", opt); +		return 1; +	case Opt_resuid: +		sbi->s_resuid = arg; +		return 1; +	case Opt_resgid: +		sbi->s_resgid = arg; +		return 1; +	case Opt_abort: +		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; +		return 1; +	case Opt_i_version: +		sb->s_flags |= MS_I_VERSION; +		return 1; +	case Opt_journal_dev: +		if (is_remount) { +			ext4_msg(sb, KERN_ERR, +				 "Cannot specify journal on remount"); +			return -1; +		} +		*journal_devnum = arg; +		
return 1; +	case Opt_journal_ioprio: +		if (arg < 0 || arg > 7) +			return -1; +		*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); +		return 1; +	} + +	for (m = ext4_mount_opts; m->token != Opt_err; m++) { +		if (token != m->token) +			continue; +		if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) +			return -1; +		if (m->flags & MOPT_EXPLICIT) +			set_opt2(sb, EXPLICIT_DELALLOC); +		if (m->flags & MOPT_CLEAR_ERR) +			clear_opt(sb, ERRORS_MASK); +		if (token == Opt_noquota && sb_any_quota_loaded(sb)) { +			ext4_msg(sb, KERN_ERR, "Cannot change quota " +				 "options when quota turned on"); +			return -1; +		} + +		if (m->flags & MOPT_NOSUPPORT) { +			ext4_msg(sb, KERN_ERR, "%s option not supported", opt); +		} else if (token == Opt_commit) { +			if (arg == 0) +				arg = JBD2_DEFAULT_MAX_COMMIT_AGE; +			sbi->s_commit_interval = HZ * arg; +		} else if (token == Opt_max_batch_time) { +			if (arg == 0) +				arg = EXT4_DEF_MAX_BATCH_TIME; +			sbi->s_max_batch_time = arg; +		} else if (token == Opt_min_batch_time) { +			sbi->s_min_batch_time = arg; +		} else if (token == Opt_inode_readahead_blks) { +			if (arg > (1 << 30)) +				return -1; +			if (arg && !is_power_of_2(arg)) {  				ext4_msg(sb, KERN_ERR, -					"Cannot specify journal on remount"); -				return 0; +					 "EXT4-fs: inode_readahead_blks" +					 " must be a power of 2"); +				return -1;  			} -			if (match_int(&args[0], &option)) -				return 0; -			*journal_devnum = option; -			break; -		case Opt_journal_checksum: -			set_opt(sb, JOURNAL_CHECKSUM); -			break; -		case Opt_journal_async_commit: -			set_opt(sb, JOURNAL_ASYNC_COMMIT); -			set_opt(sb, JOURNAL_CHECKSUM); -			break; -		case Opt_noload: -			set_opt(sb, NOLOAD); -			break; -		case Opt_commit: -			if (match_int(&args[0], &option)) -				return 0; -			if (option < 0) -				return 0; -			if (option == 0) -				option = JBD2_DEFAULT_MAX_COMMIT_AGE; -			sbi->s_commit_interval = HZ * option; -			break; -		case Opt_max_batch_time: -			if 
(match_int(&args[0], &option)) -				return 0; -			if (option < 0) -				return 0; -			if (option == 0) -				option = EXT4_DEF_MAX_BATCH_TIME; -			sbi->s_max_batch_time = option; -			break; -		case Opt_min_batch_time: -			if (match_int(&args[0], &option)) -				return 0; -			if (option < 0) -				return 0; -			sbi->s_min_batch_time = option; -			break; -		case Opt_data_journal: -			data_opt = EXT4_MOUNT_JOURNAL_DATA; -			goto datacheck; -		case Opt_data_ordered: -			data_opt = EXT4_MOUNT_ORDERED_DATA; -			goto datacheck; -		case Opt_data_writeback: -			data_opt = EXT4_MOUNT_WRITEBACK_DATA; -		datacheck: +			sbi->s_inode_readahead_blks = arg; +		} else if (token == Opt_init_itable) { +			set_opt(sb, INIT_INODE_TABLE); +			if (!args->from) +				arg = EXT4_DEF_LI_WAIT_MULT; +			sbi->s_li_wait_mult = arg; +		} else if (token == Opt_stripe) { +			sbi->s_stripe = arg; +		} else if (m->flags & MOPT_DATAJ) {  			if (is_remount) {  				if (!sbi->s_journal)  					ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); -				else if (test_opt(sb, DATA_FLAGS) != data_opt) { +				else if (test_opt(sb, DATA_FLAGS) != +					 m->mount_opt) {  					ext4_msg(sb, KERN_ERR, -						"Cannot change data mode on remount"); -					return 0; +					 "Cannot change data mode on remount"); +					return -1;  				}  			} else {  				clear_opt(sb, DATA_FLAGS); -				sbi->s_mount_opt |= data_opt; +				sbi->s_mount_opt |= m->mount_opt;  			} -			break; -		case Opt_data_err_abort: -			set_opt(sb, DATA_ERR_ABORT); -			break; -		case Opt_data_err_ignore: -			clear_opt(sb, DATA_ERR_ABORT); -			break;  #ifdef CONFIG_QUOTA -		case Opt_usrjquota: +		} else if (token == Opt_usrjquota) {  			if (!set_qf_name(sb, USRQUOTA, &args[0])) -				return 0; -			break; -		case Opt_grpjquota: +				return -1; +		} else if (token == Opt_grpjquota) {  			if (!set_qf_name(sb, GRPQUOTA, &args[0])) -				return 0; -			break; -		case Opt_offusrjquota: +				return -1; +		} else if 
(token == Opt_offusrjquota) {  			if (!clear_qf_name(sb, USRQUOTA)) -				return 0; -			break; -		case Opt_offgrpjquota: +				return -1; +		} else if (token == Opt_offgrpjquota) {  			if (!clear_qf_name(sb, GRPQUOTA)) -				return 0; -			break; - -		case Opt_jqfmt_vfsold: -			qfmt = QFMT_VFS_OLD; -			goto set_qf_format; -		case Opt_jqfmt_vfsv0: -			qfmt = QFMT_VFS_V0; -			goto set_qf_format; -		case Opt_jqfmt_vfsv1: -			qfmt = QFMT_VFS_V1; -set_qf_format: +				return -1; +		} else if (m->flags & MOPT_QFMT) {  			if (sb_any_quota_loaded(sb) && -			    sbi->s_jquota_fmt != qfmt) { -				ext4_msg(sb, KERN_ERR, "Cannot change " -					"journaled quota options when " -					"quota turned on"); -				return 0; +			    sbi->s_jquota_fmt != m->mount_opt) { +				ext4_msg(sb, KERN_ERR, "Cannot " +					 "change journaled quota options " +					 "when quota turned on"); +				return -1;  			} -			sbi->s_jquota_fmt = qfmt; -			break; -		case Opt_quota: -		case Opt_usrquota: -			set_opt(sb, QUOTA); -			set_opt(sb, USRQUOTA); -			break; -		case Opt_grpquota: -			set_opt(sb, QUOTA); -			set_opt(sb, GRPQUOTA); -			break; -		case Opt_noquota: -			if (sb_any_quota_loaded(sb)) { -				ext4_msg(sb, KERN_ERR, "Cannot change quota " -					"options when quota turned on"); -				return 0; -			} -			clear_opt(sb, QUOTA); -			clear_opt(sb, USRQUOTA); -			clear_opt(sb, GRPQUOTA); -			break; -#else -		case Opt_quota: -		case Opt_usrquota: -		case Opt_grpquota: -			ext4_msg(sb, KERN_ERR, -				"quota options not supported"); -			break; -		case Opt_usrjquota: -		case Opt_grpjquota: -		case Opt_offusrjquota: -		case Opt_offgrpjquota: -		case Opt_jqfmt_vfsold: -		case Opt_jqfmt_vfsv0: -		case Opt_jqfmt_vfsv1: -			ext4_msg(sb, KERN_ERR, -				"journaled quota options not supported"); -			break; -		case Opt_noquota: -			break; +			sbi->s_jquota_fmt = m->mount_opt;  #endif -		case Opt_abort: -			sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; -			break; -		case Opt_nobarrier: -			clear_opt(sb, BARRIER); -			break; -		
case Opt_barrier: -			if (args[0].from) { -				if (match_int(&args[0], &option)) -					return 0; -			} else -				option = 1;	/* No argument, default to 1 */ -			if (option) -				set_opt(sb, BARRIER); -			else -				clear_opt(sb, BARRIER); -			break; -		case Opt_ignore: -			break; -		case Opt_resize: -			if (!is_remount) { -				ext4_msg(sb, KERN_ERR, -					"resize option only available " -					"for remount"); -				return 0; -			} -			if (match_int(&args[0], &option) != 0) -				return 0; -			*n_blocks_count = option; -			break; -		case Opt_nobh: -			ext4_msg(sb, KERN_WARNING, -				 "Ignoring deprecated nobh option"); -			break; -		case Opt_bh: -			ext4_msg(sb, KERN_WARNING, -				 "Ignoring deprecated bh option"); -			break; -		case Opt_i_version: -			set_opt(sb, I_VERSION); -			sb->s_flags |= MS_I_VERSION; -			break; -		case Opt_nodelalloc: -			clear_opt(sb, DELALLOC); -			clear_opt2(sb, EXPLICIT_DELALLOC); -			break; -		case Opt_mblk_io_submit: -			set_opt(sb, MBLK_IO_SUBMIT); -			break; -		case Opt_nomblk_io_submit: -			clear_opt(sb, MBLK_IO_SUBMIT); -			break; -		case Opt_stripe: -			if (match_int(&args[0], &option)) -				return 0; -			if (option < 0) -				return 0; -			sbi->s_stripe = option; -			break; -		case Opt_delalloc: -			set_opt(sb, DELALLOC); -			set_opt2(sb, EXPLICIT_DELALLOC); -			break; -		case Opt_block_validity: -			set_opt(sb, BLOCK_VALIDITY); -			break; -		case Opt_noblock_validity: -			clear_opt(sb, BLOCK_VALIDITY); -			break; -		case Opt_inode_readahead_blks: -			if (match_int(&args[0], &option)) -				return 0; -			if (option < 0 || option > (1 << 30)) -				return 0; -			if (option && !is_power_of_2(option)) { -				ext4_msg(sb, KERN_ERR, -					 "EXT4-fs: inode_readahead_blks" -					 " must be a power of 2"); -				return 0; +		} else { +			if (!args->from) +				arg = 1; +			if (m->flags & MOPT_CLEAR) +				arg = !arg; +			else if (unlikely(!(m->flags & MOPT_SET))) { +				ext4_msg(sb, KERN_WARNING, +					 "buggy handling of option %s", opt); +				
WARN_ON(1); +				return -1;  			} -			sbi->s_inode_readahead_blks = option; -			break; -		case Opt_journal_ioprio: -			if (match_int(&args[0], &option)) -				return 0; -			if (option < 0 || option > 7) -				break; -			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, -							    option); -			break; -		case Opt_noauto_da_alloc: -			set_opt(sb, NO_AUTO_DA_ALLOC); -			break; -		case Opt_auto_da_alloc: -			if (args[0].from) { -				if (match_int(&args[0], &option)) -					return 0; -			} else -				option = 1;	/* No argument, default to 1 */ -			if (option) -				clear_opt(sb, NO_AUTO_DA_ALLOC); +			if (arg != 0) +				sbi->s_mount_opt |= m->mount_opt;  			else -				set_opt(sb,NO_AUTO_DA_ALLOC); -			break; -		case Opt_discard: -			set_opt(sb, DISCARD); -			break; -		case Opt_nodiscard: -			clear_opt(sb, DISCARD); -			break; -		case Opt_dioread_nolock: -			set_opt(sb, DIOREAD_NOLOCK); -			break; -		case Opt_dioread_lock: -			clear_opt(sb, DIOREAD_NOLOCK); -			break; -		case Opt_init_itable: -			set_opt(sb, INIT_INODE_TABLE); -			if (args[0].from) { -				if (match_int(&args[0], &option)) -					return 0; -			} else -				option = EXT4_DEF_LI_WAIT_MULT; -			if (option < 0) -				return 0; -			sbi->s_li_wait_mult = option; -			break; -		case Opt_noinit_itable: -			clear_opt(sb, INIT_INODE_TABLE); -			break; -		default: -			ext4_msg(sb, KERN_ERR, -			       "Unrecognized mount option \"%s\" " -			       "or missing value", p); -			return 0; +				sbi->s_mount_opt &= ~m->mount_opt;  		} +		return 1; +	} +	ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " +		 "or missing value", opt); +	return -1; +} + +static int parse_options(char *options, struct super_block *sb, +			 unsigned long *journal_devnum, +			 unsigned int *journal_ioprio, +			 int is_remount) +{ +	struct ext4_sb_info *sbi = EXT4_SB(sb); +	char *p; +	substring_t args[MAX_OPT_ARGS]; +	int token; + +	if (!options) +		return 1; + +	while ((p = strsep(&options, ",")) != NULL) { +		if (!*p) +			continue; +		/* +		 
* Initialize args struct so we know whether arg was +		 * found; some options take optional arguments. +		 */ +		args[0].to = args[0].from = 0; +		token = match_token(p, tokens, args); +		if (handle_mount_opt(sb, p, token, args, journal_devnum, +				     journal_ioprio, is_remount) < 0) +			return 0;  	}  #ifdef CONFIG_QUOTA  	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { @@ -1942,6 +1651,160 @@ set_qf_format:  	return 1;  } +static inline void ext4_show_quota_options(struct seq_file *seq, +					   struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) +	struct ext4_sb_info *sbi = EXT4_SB(sb); + +	if (sbi->s_jquota_fmt) { +		char *fmtname = ""; + +		switch (sbi->s_jquota_fmt) { +		case QFMT_VFS_OLD: +			fmtname = "vfsold"; +			break; +		case QFMT_VFS_V0: +			fmtname = "vfsv0"; +			break; +		case QFMT_VFS_V1: +			fmtname = "vfsv1"; +			break; +		} +		seq_printf(seq, ",jqfmt=%s", fmtname); +	} + +	if (sbi->s_qf_names[USRQUOTA]) +		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + +	if (sbi->s_qf_names[GRPQUOTA]) +		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + +	if (test_opt(sb, USRQUOTA)) +		seq_puts(seq, ",usrquota"); + +	if (test_opt(sb, GRPQUOTA)) +		seq_puts(seq, ",grpquota"); +#endif +} + +static const char *token2str(int token) +{ +	static const struct match_token *t; + +	for (t = tokens; t->token != Opt_err; t++) +		if (t->token == token && !strchr(t->pattern, '=')) +			break; +	return t->pattern; +} + +/* + * Show an option if + *  - it's set to a non-default value OR + *  - if the per-sb default is different from the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, +			      int nodefs) +{ +	struct ext4_sb_info *sbi = EXT4_SB(sb); +	struct ext4_super_block *es = sbi->s_es; +	int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; +	const struct mount_opts *m; +	char sep = nodefs ? 
'\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + +	if (sbi->s_sb_block != 1) +		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + +	for (m = ext4_mount_opts; m->token != Opt_err; m++) { +		int want_set = m->flags & MOPT_SET; +		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || +		    (m->flags & MOPT_CLEAR_ERR)) +			continue; +		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) +			continue; /* skip if same as the default */ +		if ((want_set && +		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || +		    (!want_set && (sbi->s_mount_opt & m->mount_opt))) +			continue; /* select Opt_noFoo vs Opt_Foo */ +		SEQ_OPTS_PRINT("%s", token2str(m->token)); +	} + +	if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || +	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) +		SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); +	if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || +	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) +		SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); +	def_errors = nodefs ? 
-1 : le16_to_cpu(es->s_errors); +	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) +		SEQ_OPTS_PUTS("errors=remount-ro"); +	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) +		SEQ_OPTS_PUTS("errors=continue"); +	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) +		SEQ_OPTS_PUTS("errors=panic"); +	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) +		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); +	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) +		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); +	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) +		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); +	if (sb->s_flags & MS_I_VERSION) +		SEQ_OPTS_PUTS("i_version"); +	if (nodefs || sbi->s_stripe) +		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); +	if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { +		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) +			SEQ_OPTS_PUTS("data=journal"); +		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) +			SEQ_OPTS_PUTS("data=ordered"); +		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) +			SEQ_OPTS_PUTS("data=writeback"); +	} +	if (nodefs || +	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) +		SEQ_OPTS_PRINT("inode_readahead_blks=%u", +			       sbi->s_inode_readahead_blks); + +	if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && +		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) +		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + +	ext4_show_quota_options(seq, sb); +	return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ +	return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ +	struct super_block *sb = seq->private; +	int rc; + +	seq_puts(seq, (sb->s_flags & MS_RDONLY) ? 
"ro" : "rw"); +	rc = _ext4_show_options(seq, sb, 1); +	seq_puts(seq, "\n"); +	return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ +	return single_open(file, options_seq_show, PDE(inode)->data); +} + +static const struct file_operations ext4_seq_options_fops = { +	.owner = THIS_MODULE, +	.open = options_open_fs, +	.read = seq_read, +	.llseek = seq_lseek, +	.release = single_release, +}; +  static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,  			    int read_only)  { @@ -2945,7 +2808,7 @@ static int ext4_run_lazyinit_thread(void)  		ext4_clear_request_list();  		kfree(ext4_li_info);  		ext4_li_info = NULL; -		printk(KERN_CRIT "EXT4: error %d creating inode table " +		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "  				 "initialization thread\n",  				 err);  		return err; @@ -3183,11 +3046,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	set_opt(sb, INIT_INODE_TABLE);  	if (def_mount_opts & EXT4_DEFM_DEBUG)  		set_opt(sb, DEBUG); -	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { -		ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", -			"2.6.38"); +	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)  		set_opt(sb, GRPID); -	}  	if (def_mount_opts & EXT4_DEFM_UID16)  		set_opt(sb, NO_UID32);  	/* xattr user namespace & acls are now defaulted on */ @@ -3240,13 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;  	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, -			   &journal_devnum, &journal_ioprio, NULL, 0)) { +			   &journal_devnum, &journal_ioprio, 0)) {  		ext4_msg(sb, KERN_WARNING,  			 "failed to parse options in superblock: %s",  			 sbi->s_es->s_mount_opts);  	} +	sbi->s_def_mount_opt = sbi->s_mount_opt;  	if (!parse_options((char *) data, sb, &journal_devnum, -			   &journal_ioprio, NULL, 0)) +			   &journal_ioprio, 0))  		goto failed_mount;  	if (test_opt(sb, DATA_FLAGS) 
== EXT4_MOUNT_JOURNAL_DATA) { @@ -3416,7 +3277,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  #else  		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);  #endif -		sb->s_dirt = 1;  	}  	/* Handle clustersize */ @@ -3540,6 +3400,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	if (ext4_proc_root)  		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); +	if (sbi->s_proc) +		proc_create_data("options", S_IRUGO, sbi->s_proc, +				 &ext4_seq_options_fops, sb); +  	bgl_lock_init(sbi->s_blockgroup_lock);  	for (i = 0; i < db_count; i++) { @@ -3694,6 +3558,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)  	}  	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); +	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; +  	/*  	 * The journal may have updated the bg summary counts, so we  	 * need to update the global counters. @@ -3735,9 +3601,8 @@ no_journal:  		iput(root);  		goto failed_mount4;  	} -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) { -		iput(root);  		ext4_msg(sb, KERN_ERR, "get root dentry failed");  		ret = -ENOMEM;  		goto failed_mount4; @@ -3862,6 +3727,7 @@ failed_mount2:  	ext4_kvfree(sbi->s_group_desc);  failed_mount:  	if (sbi->s_proc) { +		remove_proc_entry("options", sbi->s_proc);  		remove_proc_entry(sb->s_id, ext4_proc_root);  	}  #ifdef CONFIG_QUOTA @@ -4091,15 +3957,6 @@ static int ext4_load_journal(struct super_block *sb,  	if (!(journal->j_flags & JBD2_BARRIER))  		ext4_msg(sb, KERN_INFO, "barriers disabled"); -	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { -		err = jbd2_journal_update_format(journal); -		if (err)  { -			ext4_msg(sb, KERN_ERR, "error updating journal"); -			jbd2_journal_destroy(journal); -			return err; -		} -	} -  	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))  		err = jbd2_journal_wipe(journal, !really_read_only);  	if (!err) { @@ -4386,7 
+4243,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  {  	struct ext4_super_block *es;  	struct ext4_sb_info *sbi = EXT4_SB(sb); -	ext4_fsblk_t n_blocks_count = 0;  	unsigned long old_sb_flags;  	struct ext4_mount_options old_opts;  	int enable_quota = 0; @@ -4419,8 +4275,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  	/*  	 * Allow the "check" option to be passed as a remount option.  	 */ -	if (!parse_options(data, sb, NULL, &journal_ioprio, -			   &n_blocks_count, 1)) { +	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {  		err = -EINVAL;  		goto restore_opts;  	} @@ -4438,8 +4293,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);  	} -	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || -		n_blocks_count > ext4_blocks_count(es)) { +	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {  		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {  			err = -EROFS;  			goto restore_opts; @@ -4514,8 +4368,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)  			if (sbi->s_journal)  				ext4_clear_journal_err(sb, es);  			sbi->s_mount_state = le16_to_cpu(es->s_state); -			if ((err = ext4_group_extend(sb, es, n_blocks_count))) -				goto restore_opts;  			if (!ext4_setup_super(sb, es, 0))  				sb->s_flags &= ~MS_RDONLY;  			if (EXT4_HAS_INCOMPAT_FEATURE(sb, @@ -5056,6 +4908,9 @@ static int __init ext4_init_fs(void)  {  	int i, err; +	ext4_li_info = NULL; +	mutex_init(&ext4_li_mtx); +  	ext4_check_flag_values();  	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { @@ -5094,8 +4949,6 @@ static int __init ext4_init_fs(void)  	if (err)  		goto out; -	ext4_li_info = NULL; -	mutex_init(&ext4_li_mtx);  	return 0;  out:  	unregister_as_ext2(); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93a00d89a22..e88748e55c0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -82,8 +82,8 @@  		printk("\n"); \  	} 
while (0)  #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, fmt, ...)	no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)  #endif  static void ext4_xattr_cache_insert(struct buffer_head *); @@ -158,13 +158,10 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)  static inline int  ext4_xattr_check_block(struct buffer_head *bh)  { -	int error; -  	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||  	    BHDR(bh)->h_blocks != cpu_to_le32(1))  		return -EIO; -	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); -	return error; +	return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);  }  static inline int @@ -220,7 +217,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,  	error = -ENODATA;  	if (!EXT4_I(inode)->i_file_acl)  		goto cleanup; -	ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); +	ea_idebug(inode, "reading block %llu", +		  (unsigned long long)EXT4_I(inode)->i_file_acl);  	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);  	if (!bh)  		goto cleanup; @@ -363,7 +361,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)  	error = 0;  	if (!EXT4_I(inode)->i_file_acl)  		goto cleanup; -	ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); +	ea_idebug(inode, "reading block %llu", +		  (unsigned long long)EXT4_I(inode)->i_file_acl);  	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);  	error = -EIO;  	if (!bh) @@ -487,18 +486,19 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,  		ext4_free_blocks(handle, inode, bh, 0, 1,  				 EXT4_FREE_BLOCKS_METADATA |  				 EXT4_FREE_BLOCKS_FORGET); +		unlock_buffer(bh);  	} else {  		le32_add_cpu(&BHDR(bh)->h_refcount, -1); +		if (ce) +			mb_cache_entry_release(ce); +		unlock_buffer(bh);  		error = ext4_handle_dirty_metadata(handle, inode, bh);  		if (IS_SYNC(inode))  			
ext4_handle_sync(handle);  		dquot_free_block(inode, 1);  		ea_bdebug(bh, "refcount now=%d; releasing",  			  le32_to_cpu(BHDR(bh)->h_refcount)); -		if (ce) -			mb_cache_entry_release(ce);  	} -	unlock_buffer(bh);  out:  	ext4_std_error(inode->i_sb, error);  	return; @@ -834,7 +834,8 @@ inserted:  			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))  				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); -			ea_idebug(inode, "creating block %d", block); +			ea_idebug(inode, "creating block %llu", +				  (unsigned long long)block);  			new_bh = sb_getblk(sb, block);  			if (!new_bh) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3ab841054d5..21687e31acc 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1496,11 +1496,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,  	root_inode->i_ino = MSDOS_ROOT_INO;  	root_inode->i_version = 1;  	error = fat_read_root(root_inode); -	if (error < 0) +	if (error < 0) { +		iput(root_inode);  		goto out_fail; +	}  	error = -ENOMEM;  	insert_inode_hash(root_inode); -	sb->s_root = d_alloc_root(root_inode); +	sb->s_root = d_make_root(root_inode);  	if (!sb->s_root) {  		fat_msg(sb, KERN_ERR, "get root inode failed");  		goto out_fail; @@ -1516,8 +1518,6 @@ out_invalid:  out_fail:  	if (fat_inode)  		iput(fat_inode); -	if (root_inode) -		iput(root_inode);  	unload_nls(sbi->nls_io);  	unload_nls(sbi->nls_disk);  	if (sbi->options.iocharset != fat_default_iocharset) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index a81eb2367d3..98ae804f527 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -521,57 +521,46 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,  		op = &outname[*outlen * sizeof(wchar_t)];  	} else { -		if (nls) { -			for (i = 0, ip = name, op = outname, *outlen = 0; -			     i < len && *outlen <= FAT_LFN_LEN; -			     *outlen += 1) -			{ -				if (escape && (*ip == ':')) { -					if (i > len - 5) -						return -EINVAL; -					ec = 0; -					for (k 
= 1; k < 5; k++) { -						nc = ip[k]; -						ec <<= 4; -						if (nc >= '0' && nc <= '9') { -							ec |= nc - '0'; -							continue; -						} -						if (nc >= 'a' && nc <= 'f') { -							ec |= nc - ('a' - 10); -							continue; -						} -						if (nc >= 'A' && nc <= 'F') { -							ec |= nc - ('A' - 10); -							continue; -						} -						return -EINVAL; +		for (i = 0, ip = name, op = outname, *outlen = 0; +			 i < len && *outlen < FAT_LFN_LEN; +			 *outlen += 1) { +			if (escape && (*ip == ':')) { +				if (i > len - 5) +					return -EINVAL; +				ec = 0; +				for (k = 1; k < 5; k++) { +					nc = ip[k]; +					ec <<= 4; +					if (nc >= '0' && nc <= '9') { +						ec |= nc - '0'; +						continue;  					} -					*op++ = ec & 0xFF; -					*op++ = ec >> 8; -					ip += 5; -					i += 5; -				} else { -					if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0) -						return -EINVAL; -					ip += charlen; -					i += charlen; -					op += 2; +					if (nc >= 'a' && nc <= 'f') { +						ec |= nc - ('a' - 10); +						continue; +					} +					if (nc >= 'A' && nc <= 'F') { +						ec |= nc - ('A' - 10); +						continue; +					} +					return -EINVAL;  				} +				*op++ = ec & 0xFF; +				*op++ = ec >> 8; +				ip += 5; +				i += 5; +			} else { +				charlen = nls->char2uni(ip, len - i, +									(wchar_t *)op); +				if (charlen < 0) +					return -EINVAL; +				ip += charlen; +				i += charlen; +				op += 2;  			} -			if (i < len) -				return -ENAMETOOLONG; -		} else { -			for (i = 0, ip = name, op = outname, *outlen = 0; -			     i < len && *outlen <= FAT_LFN_LEN; -			     i++, *outlen += 1) -			{ -				*op++ = *ip++; -				*op++ = 0; -			} -			if (i < len) -				return -ENAMETOOLONG;  		} +		if (i < len) +			return -ENAMETOOLONG;  	}  	*longlen = *outlen; diff --git a/fs/file.c b/fs/file.c index 4c6992d8f3b..3c426de7203 100644 --- a/fs/file.c +++ b/fs/file.c @@ -6,7 +6,7 @@   *  Manage the dynamic fd arrays in the process files_struct.   
*/ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/fs.h>  #include <linux/mm.h>  #include <linux/mmzone.h> diff --git a/fs/file_table.c b/fs/file_table.c index 20002e39754..70f2a0fd6ae 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -204,7 +204,7 @@ EXPORT_SYMBOL(alloc_file);   * to write to @file, along with access to write through   * its vfsmount.   */ -void drop_file_write_access(struct file *file) +static void drop_file_write_access(struct file *file)  {  	struct vfsmount *mnt = file->f_path.mnt;  	struct dentry *dentry = file->f_path.dentry; @@ -219,7 +219,6 @@ void drop_file_write_access(struct file *file)  	mnt_drop_write(mnt);  	file_release_write(file);  } -EXPORT_SYMBOL_GPL(drop_file_write_access);  /* the real guts of fput() - releasing the last reference to file   */ diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 9d1c9955838..d4fabd26084 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -224,9 +224,8 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)  		ret = PTR_ERR(root);  		goto out;  	} -	sbp->s_root = d_alloc_root(root); +	sbp->s_root = d_make_root(root);  	if (!sbp->s_root) { -		iput(root);  		printk(KERN_WARNING "vxfs: unable to get root dentry.\n");  		goto out_free_ilist;  	} diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f855916657b..539f36cf3e4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,7 +14,7 @@   */  #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/spinlock.h>  #include <linux/slab.h>  #include <linux/sched.h> @@ -53,14 +53,6 @@ struct wb_writeback_work {  };  /* - * Include the creation of the trace points after defining the - * wb_writeback_work structure so that the definition remains local to this - * file. - */ -#define CREATE_TRACE_POINTS -#include <trace/events/writeback.h> - -/*   * We don't actually have pdflush, but this one is exported though /proc...   
*/  int nr_pdflush_threads; @@ -92,6 +84,14 @@ static inline struct inode *wb_inode(struct list_head *head)  	return list_entry(head, struct inode, i_wb_list);  } +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure and inline functions so that the definition + * remains local to this file. + */ +#define CREATE_TRACE_POINTS +#include <trace/events/writeback.h> +  /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */  static void bdi_wakeup_flusher(struct backing_dev_info *bdi)  { @@ -256,7 +256,8 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)  }  /* - * Move expired dirty inodes from @delaying_queue to @dispatch_queue. + * Move expired (dirtied after work->older_than_this) dirty inodes from + * @delaying_queue to @dispatch_queue.   */  static int move_expired_inodes(struct list_head *delaying_queue,  			       struct list_head *dispatch_queue, @@ -1148,23 +1149,6 @@ out_unlock_inode:  }  EXPORT_SYMBOL(__mark_inode_dirty); -/* - * Write out a superblock's list of dirty inodes.  A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched.  For other superblocks, - * assume that all inodes are backed by the same queue. - * - * The inodes to be written are parked on bdi->b_io.  They are moved back onto - * bdi->b_dirty as they are selected for writing.  This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. 
- */  static void wait_sb_inodes(struct super_block *sb)  {  	struct inode *inode, *old_inode = NULL; @@ -1284,7 +1268,7 @@ int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)  EXPORT_SYMBOL(writeback_inodes_sb_if_idle);  /** - * writeback_inodes_sb_if_idle	-	start writeback if none underway + * writeback_inodes_sb_nr_if_idle	-	start writeback if none underway   * @sb: the superblock   * @nr: the number of pages to write   * @reason: reason why some writeback work was initiated @@ -1364,8 +1348,6 @@ int write_inode_now(struct inode *inode, int sync)  	ret = writeback_single_inode(inode, wb, &wbc);  	spin_unlock(&inode->i_lock);  	spin_unlock(&wb->list_lock); -	if (sync) -		inode_sync_wait(inode);  	return ret;  }  EXPORT_SYMBOL(write_inode_now); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 78b519c1353..e159e682ad4 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/sched.h>  #include <linux/fs.h>  #include <linux/path.h> @@ -26,11 +26,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)  {  	struct path old_root; +	path_get_longterm(path);  	spin_lock(&fs->lock);  	write_seqcount_begin(&fs->seq);  	old_root = fs->root;  	fs->root = *path; -	path_get_longterm(path);  	write_seqcount_end(&fs->seq);  	spin_unlock(&fs->lock);  	if (old_root.dentry) @@ -45,11 +45,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)  {  	struct path old_pwd; +	path_get_longterm(path);  	spin_lock(&fs->lock);  	write_seqcount_begin(&fs->seq);  	old_pwd = fs->pwd;  	fs->pwd = *path; -	path_get_longterm(path);  	write_seqcount_end(&fs->seq);  	spin_unlock(&fs->lock); @@ -57,6 +57,14 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)  		path_put_longterm(&old_pwd);  } +static inline int replace_path(struct path *p, const struct path *old, const struct path *new) +{ +	if (likely(p->dentry != old->dentry || p->mnt != old->mnt)) +		return 0; +	*p = 
*new; +	return 1; +} +  void chroot_fs_refs(struct path *old_root, struct path *new_root)  {  	struct task_struct *g, *p; @@ -68,21 +76,16 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)  		task_lock(p);  		fs = p->fs;  		if (fs) { +			int hits = 0;  			spin_lock(&fs->lock);  			write_seqcount_begin(&fs->seq); -			if (fs->root.dentry == old_root->dentry -			    && fs->root.mnt == old_root->mnt) { -				path_get_longterm(new_root); -				fs->root = *new_root; +			hits += replace_path(&fs->root, old_root, new_root); +			hits += replace_path(&fs->pwd, old_root, new_root); +			write_seqcount_end(&fs->seq); +			while (hits--) {  				count++; -			} -			if (fs->pwd.dentry == old_root->dentry -			    && fs->pwd.mnt == old_root->mnt) {  				path_get_longterm(new_root); -				fs->pwd = *new_root; -				count++;  			} -			write_seqcount_end(&fs->seq);  			spin_unlock(&fs->lock);  		}  		task_unlock(p); @@ -107,10 +110,8 @@ void exit_fs(struct task_struct *tsk)  		int kill;  		task_lock(tsk);  		spin_lock(&fs->lock); -		write_seqcount_begin(&fs->seq);  		tsk->fs = NULL;  		kill = !--fs->users; -		write_seqcount_end(&fs->seq);  		spin_unlock(&fs->lock);  		task_unlock(tsk);  		if (kill) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5f3368ab0fa..7df2b5e8fbe 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -838,10 +838,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,  			}  		}  		if (page) { -			void *mapaddr = kmap_atomic(page, KM_USER0); +			void *mapaddr = kmap_atomic(page);  			void *buf = mapaddr + offset;  			offset += fuse_copy_do(cs, &buf, &count); -			kunmap_atomic(mapaddr, KM_USER0); +			kunmap_atomic(mapaddr);  		} else  			offset += fuse_copy_do(cs, NULL, &count);  	} diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4a199fd93fb..a841868bf9c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1887,11 +1887,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,  		    in_iovs + out_iovs > 
FUSE_IOCTL_MAX_IOV)  			goto out; -		vaddr = kmap_atomic(pages[0], KM_USER0); +		vaddr = kmap_atomic(pages[0]);  		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,  					    transferred, in_iovs + out_iovs,  					    (flags & FUSE_IOCTL_COMPAT) != 0); -		kunmap_atomic(vaddr, KM_USER0); +		kunmap_atomic(vaddr);  		if (err)  			goto out; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 64cf8d07393..4aec5995867 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -988,14 +988,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)  	err = -ENOMEM;  	root = fuse_get_root_inode(sb, d.rootmode); -	if (!root) +	root_dentry = d_make_root(root); +	if (!root_dentry)  		goto err_put_conn; - -	root_dentry = d_alloc_root(root); -	if (!root_dentry) { -		iput(root); -		goto err_put_conn; -	}  	/* only now - we want root dentry with NULL ->d_op */  	sb->s_d_op = &fuse_dentry_operations; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 501e5cba09b..38b7a74a0f9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -434,12 +434,12 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)  	if (error)  		return error; -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))  		dsize = (dibh->b_size - sizeof(struct gfs2_dinode));  	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);  	memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	flush_dcache_page(page);  	brelse(dibh);  	SetPageUptodate(page); @@ -542,9 +542,9 @@ int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,  		page = read_cache_page(mapping, index, __gfs2_readpage, NULL);  		if (IS_ERR(page))  			return PTR_ERR(page); -		p = kmap_atomic(page, KM_USER0); +		p = kmap_atomic(page);  		memcpy(buf + copied, p + offset, amt); -		kunmap_atomic(p, KM_USER0); +		kunmap_atomic(p);  		mark_page_accessed(page);  		
page_cache_release(page);  		copied += amt; @@ -788,11 +788,11 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,  	unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);  	BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memcpy(buf + pos, kaddr + pos, copied);  	memset(kaddr + pos + copied, 0, len - copied);  	flush_dcache_page(page); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (!PageUptodate(page))  		SetPageUptodate(page); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 14a70401597..197c5c47e57 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -60,7 +60,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,  	int release = 0;  	if (!page || page->index) { -		page = grab_cache_page(inode->i_mapping, 0); +		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);  		if (!page)  			return -ENOMEM;  		release = 1; @@ -930,7 +930,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)  	struct page *page;  	int err; -	page = grab_cache_page(mapping, index); +	page = find_or_create_page(mapping, index, GFP_NOFS);  	if (!page)  		return 0; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c5fb3597f69..76834587a8a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -313,6 +313,8 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)  		return gfs2_get_flags(filp, (u32 __user *)arg);  	case FS_IOC_SETFLAGS:  		return gfs2_set_flags(filp, (u32 __user *)arg); +	case FITRIM: +		return gfs2_fitrim(filp, (void __user *)arg);  	}  	return -ENOTTY;  } @@ -674,6 +676,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,  	struct gfs2_inode *ip = GFS2_I(inode);  	struct buffer_head *dibh;  	int error; +	loff_t size = len;  	unsigned int nr_blks;  	sector_t lblock = offset >> inode->i_blkbits; @@ -707,8 +710,8 @@ 
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,  			goto out;  		}  	} -	if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) -		i_size_write(inode, offset + len); +	if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) +		i_size_write(inode, offset + size);  	mark_inode_dirty(inode); @@ -777,12 +780,14 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,  	if (unlikely(error))  		goto out_uninit; -	if (!gfs2_write_alloc_required(ip, offset, len)) -		goto out_unlock; -  	while (len > 0) {  		if (len < bytes)  			bytes = len; +		if (!gfs2_write_alloc_required(ip, offset, bytes)) { +			len -= bytes; +			offset += bytes; +			continue; +		}  		qa = gfs2_qadata_get(ip);  		if (!qa) {  			error = -ENOMEM; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 376816fcd04..dab2526071c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -29,6 +29,7 @@  #include <linux/rcupdate.h>  #include <linux/rculist_bl.h>  #include <linux/bit_spinlock.h> +#include <linux/percpu.h>  #include "gfs2.h"  #include "incore.h" @@ -167,14 +168,19 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)  	spin_unlock(&lru_lock);  } -static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)  { -	spin_lock(&lru_lock);  	if (!list_empty(&gl->gl_lru)) {  		list_del_init(&gl->gl_lru);  		atomic_dec(&lru_count);  		clear_bit(GLF_LRU, &gl->gl_flags);  	} +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ +	spin_lock(&lru_lock); +	__gfs2_glock_remove_from_lru(gl);  	spin_unlock(&lru_lock);  } @@ -217,11 +223,12 @@ void gfs2_glock_put(struct gfs2_glock *gl)  	struct gfs2_sbd *sdp = gl->gl_sbd;  	struct address_space *mapping = gfs2_glock2aspace(gl); -	if (atomic_dec_and_test(&gl->gl_ref)) { +	if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { +		__gfs2_glock_remove_from_lru(gl); +		spin_unlock(&lru_lock);  		spin_lock_bucket(gl->gl_hash);  		
hlist_bl_del_rcu(&gl->gl_list);  		spin_unlock_bucket(gl->gl_hash); -		gfs2_glock_remove_from_lru(gl);  		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));  		GLOCK_BUG_ON(gl, mapping && mapping->nrpages);  		trace_gfs2_glock_put(gl); @@ -537,6 +544,11 @@ __acquires(&gl->gl_spin)  		do_error(gl, 0); /* Fail queued try locks */  	}  	gl->gl_req = target; +	set_bit(GLF_BLOCKING, &gl->gl_flags); +	if ((gl->gl_req == LM_ST_UNLOCKED) || +	    (gl->gl_state == LM_ST_EXCLUSIVE) || +	    (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) +		clear_bit(GLF_BLOCKING, &gl->gl_flags);  	spin_unlock(&gl->gl_spin);  	if (glops->go_xmote_th)  		glops->go_xmote_th(gl); @@ -738,6 +750,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,  		return -ENOMEM;  	atomic_inc(&sdp->sd_glock_disposal); +	gl->gl_sbd = sdp;  	gl->gl_flags = 0;  	gl->gl_name = name;  	atomic_set(&gl->gl_ref, 1); @@ -746,12 +759,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,  	gl->gl_demote_state = LM_ST_EXCLUSIVE;  	gl->gl_hash = hash;  	gl->gl_ops = glops; -	snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number); +	gl->gl_dstamp = ktime_set(0, 0); +	preempt_disable(); +	/* We use the global stats to estimate the initial per-glock stats */ +	gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; +	preempt_enable(); +	gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; +	gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0;  	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));  	gl->gl_lksb.sb_lvbptr = gl->gl_lvb;  	gl->gl_tchange = jiffies;  	gl->gl_object = NULL; -	gl->gl_sbd = sdp;  	gl->gl_hold_time = GL_GLOCK_DFT_HOLD;  	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);  	INIT_WORK(&gl->gl_delete, delete_work_func); @@ -993,6 +1011,8 @@ fail:  	}  	set_bit(GLF_QUEUED, &gl->gl_flags);  	trace_gfs2_glock_queue(gh, 1); +	gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); +	gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);  	if (likely(insert_pt == NULL)) {  		list_add_tail(&gh->gh_list, 
&gl->gl_holders);  		if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) @@ -1652,6 +1672,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)  		*p++ = 'L';  	if (gl->gl_object)  		*p++ = 'o'; +	if (test_bit(GLF_BLOCKING, gflags)) +		*p++ = 'b';  	*p = 0;  	return buf;  } @@ -1708,8 +1730,78 @@ out:  	return error;  } +static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ +	struct gfs2_glock *gl = iter_ptr; +	seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n", +		   gl->gl_name.ln_type, +		   (unsigned long long)gl->gl_name.ln_number, +		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTT], +		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], +		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], +		   (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], +		   (long long)gl->gl_stats.stats[GFS2_LKS_SIRT], +		   (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], +		   (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], +		   (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); +	return 0; +} + +static const char *gfs2_gltype[] = { +	"type", +	"reserved", +	"nondisk", +	"inode", +	"rgrp", +	"meta", +	"iopen", +	"flock", +	"plock", +	"quota", +	"journal", +}; + +static const char *gfs2_stype[] = { +	[GFS2_LKS_SRTT]		= "srtt", +	[GFS2_LKS_SRTTVAR]	= "srttvar", +	[GFS2_LKS_SRTTB]	= "srttb", +	[GFS2_LKS_SRTTVARB]	= "srttvarb", +	[GFS2_LKS_SIRT]		= "sirt", +	[GFS2_LKS_SIRTVAR]	= "sirtvar", +	[GFS2_LKS_DCOUNT]	= "dlm", +	[GFS2_LKS_QCOUNT]	= "queue", +}; + +#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) + +static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ +	struct gfs2_glock_iter *gi = seq->private; +	struct gfs2_sbd *sdp = gi->sdp; +	unsigned index = gi->hash >> 3; +	unsigned subindex = gi->hash & 0x07; +	s64 value; +	int i; + +	if (index == 0 && subindex != 0) +		return 0; +	seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], +		   (index == 0) ? 
"cpu": gfs2_stype[subindex]); + +	for_each_possible_cpu(i) { +                const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); +		if (index == 0) { +			value = i; +		} else { +			value = lkstats->lkstats[index - 1].stats[subindex]; +		} +		seq_printf(seq, " %15lld", (long long)value); +	} +	seq_putc(seq, '\n'); +	return 0; +}  int __init gfs2_glock_init(void)  { @@ -1822,6 +1914,35 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)  	return dump_glock(seq, iter_ptr);  } +static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) +{ +	struct gfs2_glock_iter *gi = seq->private; + +	gi->hash = *pos; +	if (*pos >= GFS2_NR_SBSTATS) +		return NULL; +	preempt_disable(); +	return SEQ_START_TOKEN; +} + +static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, +				   loff_t *pos) +{ +	struct gfs2_glock_iter *gi = seq->private; +	(*pos)++; +	gi->hash++; +	if (gi->hash >= GFS2_NR_SBSTATS) { +		preempt_enable(); +		return NULL; +	} +	return SEQ_START_TOKEN; +} + +static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) +{ +	preempt_enable(); +} +  static const struct seq_operations gfs2_glock_seq_ops = {  	.start = gfs2_glock_seq_start,  	.next  = gfs2_glock_seq_next, @@ -1829,7 +1950,21 @@ static const struct seq_operations gfs2_glock_seq_ops = {  	.show  = gfs2_glock_seq_show,  }; -static int gfs2_debugfs_open(struct inode *inode, struct file *file) +static const struct seq_operations gfs2_glstats_seq_ops = { +	.start = gfs2_glock_seq_start, +	.next  = gfs2_glock_seq_next, +	.stop  = gfs2_glock_seq_stop, +	.show  = gfs2_glstats_seq_show, +}; + +static const struct seq_operations gfs2_sbstats_seq_ops = { +	.start = gfs2_sbstats_seq_start, +	.next  = gfs2_sbstats_seq_next, +	.stop  = gfs2_sbstats_seq_stop, +	.show  = gfs2_sbstats_seq_show, +}; + +static int gfs2_glocks_open(struct inode *inode, struct file *file)  {  	int ret = seq_open_private(file, &gfs2_glock_seq_ops,  				   
sizeof(struct gfs2_glock_iter)); @@ -1841,9 +1976,49 @@ static int gfs2_debugfs_open(struct inode *inode, struct file *file)  	return ret;  } -static const struct file_operations gfs2_debug_fops = { +static int gfs2_glstats_open(struct inode *inode, struct file *file) +{ +	int ret = seq_open_private(file, &gfs2_glstats_seq_ops, +				   sizeof(struct gfs2_glock_iter)); +	if (ret == 0) { +		struct seq_file *seq = file->private_data; +		struct gfs2_glock_iter *gi = seq->private; +		gi->sdp = inode->i_private; +	} +	return ret; +} + +static int gfs2_sbstats_open(struct inode *inode, struct file *file) +{ +	int ret = seq_open_private(file, &gfs2_sbstats_seq_ops, +				   sizeof(struct gfs2_glock_iter)); +	if (ret == 0) { +		struct seq_file *seq = file->private_data; +		struct gfs2_glock_iter *gi = seq->private; +		gi->sdp = inode->i_private; +	} +	return ret; +} + +static const struct file_operations gfs2_glocks_fops = { +	.owner   = THIS_MODULE, +	.open    = gfs2_glocks_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_private, +}; + +static const struct file_operations gfs2_glstats_fops = { +	.owner   = THIS_MODULE, +	.open    = gfs2_glstats_open, +	.read    = seq_read, +	.llseek  = seq_lseek, +	.release = seq_release_private, +}; + +static const struct file_operations gfs2_sbstats_fops = {  	.owner   = THIS_MODULE, -	.open    = gfs2_debugfs_open, +	.open	 = gfs2_sbstats_open,  	.read    = seq_read,  	.llseek  = seq_lseek,  	.release = seq_release_private, @@ -1857,20 +2032,45 @@ int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)  	sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",  							 S_IFREG | S_IRUGO,  							 sdp->debugfs_dir, sdp, -							 &gfs2_debug_fops); +							 &gfs2_glocks_fops);  	if (!sdp->debugfs_dentry_glocks) -		return -ENOMEM; +		goto fail; + +	sdp->debugfs_dentry_glstats = debugfs_create_file("glstats", +							S_IFREG | S_IRUGO, +							sdp->debugfs_dir, sdp, +							&gfs2_glstats_fops); +	if 
(!sdp->debugfs_dentry_glstats) +		goto fail; + +	sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats", +							S_IFREG | S_IRUGO, +							sdp->debugfs_dir, sdp, +							&gfs2_sbstats_fops); +	if (!sdp->debugfs_dentry_sbstats) +		goto fail;  	return 0; +fail: +	gfs2_delete_debugfs_file(sdp); +	return -ENOMEM;  }  void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)  { -	if (sdp && sdp->debugfs_dir) { +	if (sdp->debugfs_dir) {  		if (sdp->debugfs_dentry_glocks) {  			debugfs_remove(sdp->debugfs_dentry_glocks);  			sdp->debugfs_dentry_glocks = NULL;  		} +		if (sdp->debugfs_dentry_glstats) { +			debugfs_remove(sdp->debugfs_dentry_glstats); +			sdp->debugfs_dentry_glstats = NULL; +		} +		if (sdp->debugfs_dentry_sbstats) { +			debugfs_remove(sdp->debugfs_dentry_sbstats); +			sdp->debugfs_dentry_sbstats = NULL; +		}  		debugfs_remove(sdp->debugfs_dir);  		sdp->debugfs_dir = NULL;  	} diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 97742a7ea9c..47d0bda5ac2 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -19,6 +19,8 @@  #include <linux/rculist_bl.h>  #include <linux/completion.h>  #include <linux/rbtree.h> +#include <linux/ktime.h> +#include <linux/percpu.h>  #define DIO_WAIT	0x00000010  #define DIO_METADATA	0x00000020 @@ -205,6 +207,22 @@ struct gfs2_glock_operations {  };  enum { +	GFS2_LKS_SRTT = 0,	/* Non blocking smoothed round trip time */ +	GFS2_LKS_SRTTVAR = 1,	/* Non blocking smoothed variance */ +	GFS2_LKS_SRTTB = 2,	/* Blocking smoothed round trip time */ +	GFS2_LKS_SRTTVARB = 3,	/* Blocking smoothed variance */ +	GFS2_LKS_SIRT = 4,	/* Smoothed Inter-request time */ +	GFS2_LKS_SIRTVAR = 5,	/* Smoothed Inter-request variance */ +	GFS2_LKS_DCOUNT = 6,	/* Count of dlm requests */ +	GFS2_LKS_QCOUNT = 7,	/* Count of gfs2_holder queues */ +	GFS2_NR_LKSTATS +}; + +struct gfs2_lkstats { +	s64 stats[GFS2_NR_LKSTATS]; +}; + +enum {  	/* States */  	HIF_HOLDER		= 6,  /* Set for gh that "holds" the glock */  	HIF_FIRST		= 7, @@ -238,10 +256,12 @@ enum { 
 	GLF_QUEUED			= 12,  	GLF_LRU				= 13,  	GLF_OBJECT			= 14, /* Used only for tracing */ +	GLF_BLOCKING			= 15,  };  struct gfs2_glock {  	struct hlist_bl_node gl_list; +	struct gfs2_sbd *gl_sbd;  	unsigned long gl_flags;		/* GLF_... */  	struct lm_lockname gl_name;  	atomic_t gl_ref; @@ -261,16 +281,14 @@ struct gfs2_glock {  	struct list_head gl_holders;  	const struct gfs2_glock_operations *gl_ops; -	char gl_strname[GDLM_STRNAME_BYTES]; +	ktime_t gl_dstamp; +	struct gfs2_lkstats gl_stats;  	struct dlm_lksb gl_lksb;  	char gl_lvb[32];  	unsigned long gl_tchange;  	void *gl_object;  	struct list_head gl_lru; - -	struct gfs2_sbd *gl_sbd; -  	struct list_head gl_ail_list;  	atomic_t gl_ail_count;  	atomic_t gl_revokes; @@ -560,8 +578,14 @@ struct lm_lockstruct {  	uint32_t *ls_recover_result; /* result of last jid recovery */  }; +struct gfs2_pcpu_lkstats { +	/* One struct for each glock type */ +	struct gfs2_lkstats lkstats[10]; +}; +  struct gfs2_sbd {  	struct super_block *sd_vfs; +	struct gfs2_pcpu_lkstats __percpu *sd_lkstats;  	struct kobject sd_kobj;  	unsigned long sd_flags;	/* SDF_... 
*/  	struct gfs2_sb_host sd_sb; @@ -620,7 +644,6 @@ struct gfs2_sbd {  	int sd_rindex_uptodate;  	spinlock_t sd_rindex_spin; -	struct mutex sd_rindex_mutex;  	struct rb_root sd_rindex_tree;  	unsigned int sd_rgrps;  	unsigned int sd_max_rg_data; @@ -725,8 +748,23 @@ struct gfs2_sbd {  	unsigned long sd_last_warning;  	struct dentry *debugfs_dir;    /* debugfs directory */ -	struct dentry *debugfs_dentry_glocks; /* for debugfs */ +	struct dentry *debugfs_dentry_glocks; +	struct dentry *debugfs_dentry_glstats; +	struct dentry *debugfs_dentry_sbstats;  }; +static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) +{ +	gl->gl_stats.stats[which]++; +} + +static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) +{ +	const struct gfs2_sbd *sdp = gl->gl_sbd; +	preempt_disable(); +	this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++; +	preempt_enable(); +} +  #endif /* __INCORE_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a7d611b93f0..c98a60ee6df 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -391,10 +391,6 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)  	int error;  	int dblocks = 1; -	error = gfs2_rindex_update(sdp); -	if (error) -		fs_warn(sdp, "rindex update returns %d\n", error); -  	error = gfs2_inplace_reserve(dip, RES_DINODE);  	if (error)  		goto out; @@ -1040,9 +1036,10 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)  	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);  	gfs2_holder_init(ip->i_gl,  LM_ST_EXCLUSIVE, 0, ghs + 1); -	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); +	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);  	if (!rgd)  		goto out_inodes; +  	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); @@ -1258,7 +1255,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,  		 * this is the case of the target file already existing  		 * so we unlink before doing the rename  		 */ -		nrgd = 
gfs2_blk2rgrpd(sdp, nip->i_no_addr); +		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1);  		if (nrgd)  			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);  	} diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8944d1e32ab..f8411bd1b80 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -18,14 +18,106 @@  #include "glock.h"  #include "util.h"  #include "sys.h" +#include "trace_gfs2.h"  extern struct workqueue_struct *gfs2_control_wq; +/** + * gfs2_update_stats - Update time based stats + * @mv: Pointer to mean/variance structure to update + * @sample: New data to include + * + * @delta is the difference between the current rtt sample and the + * running average srtt. We add 1/8 of that to the srtt in order to + * update the current srtt estimate. The varience estimate is a bit + * more complicated. We subtract the abs value of the @delta from + * the current variance estimate and add 1/4 of that to the running + * total. + * + * Note that the index points at the array entry containing the smoothed + * mean value, and the variance is always in the following entry + * + * Reference: TCP/IP Illustrated, vol 2, p. 831,832 + * All times are in units of integer nanoseconds. Unlike the TCP/IP case, + * they are not scaled fixed point. + */ + +static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, +				     s64 sample) +{ +	s64 delta = sample - s->stats[index]; +	s->stats[index] += (delta >> 3); +	index++; +	s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2); +} + +/** + * gfs2_update_reply_times - Update locking statistics + * @gl: The glock to update + * + * This assumes that gl->gl_dstamp has been set earlier. + * + * The rtt (lock round trip time) is an estimate of the time + * taken to perform a dlm lock request. We update it on each + * reply from the dlm. + * + * The blocking flag is set on the glock for all dlm requests + * which may potentially block due to lock requests from other nodes. 
+ * DLM requests where the current lock state is exclusive, the + * requested state is null (or unlocked) or where the TRY or + * TRY_1CB flags are set are classified as non-blocking. All + * other DLM requests are counted as (potentially) blocking. + */ +static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +{ +	struct gfs2_pcpu_lkstats *lks; +	const unsigned gltype = gl->gl_name.ln_type; +	unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? +			 GFS2_LKS_SRTTB : GFS2_LKS_SRTT; +	s64 rtt; + +	preempt_disable(); +	rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp)); +	lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); +	gfs2_update_stats(&gl->gl_stats, index, rtt);		/* Local */ +	gfs2_update_stats(&lks->lkstats[gltype], index, rtt);	/* Global */ +	preempt_enable(); + +	trace_gfs2_glock_lock_time(gl, rtt); +} + +/** + * gfs2_update_request_times - Update locking statistics + * @gl: The glock to update + * + * The irt (lock inter-request times) measures the average time + * between requests to the dlm. It is updated immediately before + * each dlm call. 
+ */ + +static inline void gfs2_update_request_times(struct gfs2_glock *gl) +{ +	struct gfs2_pcpu_lkstats *lks; +	const unsigned gltype = gl->gl_name.ln_type; +	ktime_t dstamp; +	s64 irt; + +	preempt_disable(); +	dstamp = gl->gl_dstamp; +	gl->gl_dstamp = ktime_get_real(); +	irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp)); +	lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); +	gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt);		/* Local */ +	gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt);	/* Global */ +	preempt_enable(); +} +   static void gdlm_ast(void *arg)  {  	struct gfs2_glock *gl = arg;  	unsigned ret = gl->gl_state; +	gfs2_update_reply_times(gl);  	BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);  	if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) @@ -111,7 +203,7 @@ static int make_mode(const unsigned int lmstate)  static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,  		      const int req)  { -	u32 lkf = 0; +	u32 lkf = DLM_LKF_VALBLK;  	if (gfs_flags & LM_FLAG_TRY)  		lkf |= DLM_LKF_NOQUEUE; @@ -138,26 +230,43 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,  	if (lkid != 0)   		lkf |= DLM_LKF_CONVERT; -	lkf |= DLM_LKF_VALBLK; -  	return lkf;  } +static void gfs2_reverse_hex(char *c, u64 value) +{ +	while (value) { +		*c-- = hex_asc[value & 0x0f]; +		value >>= 4; +	} +} +  static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,  		     unsigned int flags)  {  	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;  	int req;  	u32 lkf; +	char strname[GDLM_STRNAME_BYTES] = "";  	req = make_mode(req_state);  	lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); - +	gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); +	gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); +	if (gl->gl_lksb.sb_lkid) { +		gfs2_update_request_times(gl); +	} else { +		memset(strname, ' ', GDLM_STRNAME_BYTES - 1); +		strname[GDLM_STRNAME_BYTES - 1] = '\0'; +		gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type); +		gfs2_reverse_hex(strname + 23, 
gl->gl_name.ln_number); +		gl->gl_dstamp = ktime_get_real(); +	}  	/*  	 * Submit the actual lock request.  	 */ -	return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, +	return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname,  			GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);  } @@ -172,6 +281,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl)  		return;  	} +	clear_bit(GLF_BLOCKING, &gl->gl_flags); +	gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); +	gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); +	gfs2_update_request_times(gl);  	error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,  			   NULL, gl);  	if (error) { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 756fae9eaf8..4752eadc7f6 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -19,6 +19,7 @@  #include <linux/freezer.h>  #include <linux/bio.h>  #include <linux/writeback.h> +#include <linux/list_sort.h>  #include "gfs2.h"  #include "incore.h" @@ -358,7 +359,7 @@ retry:  	return 0;  } -static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) +u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)  {  	struct gfs2_journal_extent *je; @@ -467,8 +468,8 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)  void gfs2_log_incr_head(struct gfs2_sbd *sdp)  { -	if (sdp->sd_log_flush_head == sdp->sd_log_tail) -		BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head); +	BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && +	       (sdp->sd_log_flush_head != sdp->sd_log_head));  	if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {  		sdp->sd_log_flush_head = 0; @@ -476,99 +477,6 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp)  	}  } -/** - * gfs2_log_write_endio - End of I/O for a log buffer - * @bh: The buffer head - * @uptodate: I/O Status - * - */ - -static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) -{ -	struct gfs2_sbd *sdp = bh->b_private; -	bh->b_private = NULL; - -	end_buffer_write_sync(bh, uptodate); -	if 
(atomic_dec_and_test(&sdp->sd_log_in_flight)) -		wake_up(&sdp->sd_log_flush_wait); -} - -/** - * gfs2_log_get_buf - Get and initialize a buffer to use for log control data - * @sdp: The GFS2 superblock - * - * Returns: the buffer_head - */ - -struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) -{ -	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); -	struct buffer_head *bh; - -	bh = sb_getblk(sdp->sd_vfs, blkno); -	lock_buffer(bh); -	memset(bh->b_data, 0, bh->b_size); -	set_buffer_uptodate(bh); -	clear_buffer_dirty(bh); -	gfs2_log_incr_head(sdp); -	atomic_inc(&sdp->sd_log_in_flight); -	bh->b_private = sdp; -	bh->b_end_io = gfs2_log_write_endio; - -	return bh; -} - -/** - * gfs2_fake_write_endio -  - * @bh: The buffer head - * @uptodate: The I/O Status - * - */ - -static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) -{ -	struct buffer_head *real_bh = bh->b_private; -	struct gfs2_bufdata *bd = real_bh->b_private; -	struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; - -	end_buffer_write_sync(bh, uptodate); -	free_buffer_head(bh); -	unlock_buffer(real_bh); -	brelse(real_bh); -	if (atomic_dec_and_test(&sdp->sd_log_in_flight)) -		wake_up(&sdp->sd_log_flush_wait); -} - -/** - * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log - * @sdp: the filesystem - * @data: the data the buffer_head should point to - * - * Returns: the log buffer descriptor - */ - -struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, -				      struct buffer_head *real) -{ -	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); -	struct buffer_head *bh; - -	bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); -	atomic_set(&bh->b_count, 1); -	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); -	set_bh_page(bh, real->b_page, bh_offset(real)); -	bh->b_blocknr = blkno; -	bh->b_size = sdp->sd_sb.sb_bsize; -	bh->b_bdev = sdp->sd_vfs->s_bdev; -	bh->b_private = real; -	bh->b_end_io = gfs2_fake_write_endio; - -	gfs2_log_incr_head(sdp); -	
atomic_inc(&sdp->sd_log_in_flight); - -	return bh; -} -  static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)  {  	unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); @@ -583,66 +491,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)  	sdp->sd_log_tail = new_tail;  } -/** - * log_write_header - Get and initialize a journal header buffer - * @sdp: The GFS2 superblock - * - * Returns: the initialized log buffer descriptor - */ - -static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) -{ -	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); -	struct buffer_head *bh; -	struct gfs2_log_header *lh; -	unsigned int tail; -	u32 hash; - -	bh = sb_getblk(sdp->sd_vfs, blkno); -	lock_buffer(bh); -	memset(bh->b_data, 0, bh->b_size); -	set_buffer_uptodate(bh); -	clear_buffer_dirty(bh); - -	gfs2_ail1_empty(sdp); -	tail = current_tail(sdp); - -	lh = (struct gfs2_log_header *)bh->b_data; -	memset(lh, 0, sizeof(struct gfs2_log_header)); -	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); -	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); -	lh->lh_header.__pad0 = cpu_to_be64(0); -	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); -	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); -	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); -	lh->lh_flags = cpu_to_be32(flags); -	lh->lh_tail = cpu_to_be32(tail); -	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); -	hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); -	lh->lh_hash = cpu_to_be32(hash); - -	bh->b_end_io = end_buffer_write_sync; -	get_bh(bh); -	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) -		submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); -	else -		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); -	wait_on_buffer(bh); -	if (!buffer_uptodate(bh)) -		gfs2_io_error_bh(sdp, bh); -	brelse(bh); - -	if (sdp->sd_log_tail != tail) -		log_pull_tail(sdp, tail); -	else -		gfs2_assert_withdraw(sdp, !pull); - -	sdp->sd_log_idle = (tail 
== sdp->sd_log_flush_head); -	gfs2_log_incr_head(sdp); -} - -static void log_flush_commit(struct gfs2_sbd *sdp) +static void log_flush_wait(struct gfs2_sbd *sdp)  {  	DEFINE_WAIT(wait); @@ -655,8 +505,20 @@ static void log_flush_commit(struct gfs2_sbd *sdp)  		} while(atomic_read(&sdp->sd_log_in_flight));  		finish_wait(&sdp->sd_log_flush_wait, &wait);  	} +} + +static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) +{ +	struct gfs2_bufdata *bda, *bdb; + +	bda = list_entry(a, struct gfs2_bufdata, bd_le.le_list); +	bdb = list_entry(b, struct gfs2_bufdata, bd_le.le_list); -	log_write_header(sdp, 0, 0); +	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) +		return -1; +	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) +		return 1; +	return 0;  }  static void gfs2_ordered_write(struct gfs2_sbd *sdp) @@ -666,6 +528,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)  	LIST_HEAD(written);  	gfs2_log_lock(sdp); +	list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);  	while (!list_empty(&sdp->sd_log_le_ordered)) {  		bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list);  		list_move(&bd->bd_le.le_list, &written); @@ -711,6 +574,68 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)  }  /** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) +{ +	u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); +	struct buffer_head *bh; +	struct gfs2_log_header *lh; +	unsigned int tail; +	u32 hash; + +	bh = sb_getblk(sdp->sd_vfs, blkno); +	lock_buffer(bh); +	memset(bh->b_data, 0, bh->b_size); +	set_buffer_uptodate(bh); +	clear_buffer_dirty(bh); + +	gfs2_ail1_empty(sdp); +	tail = current_tail(sdp); + +	lh = (struct gfs2_log_header *)bh->b_data; +	memset(lh, 0, sizeof(struct gfs2_log_header)); +	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); +	
lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); +	lh->lh_header.__pad0 = cpu_to_be64(0); +	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); +	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); +	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); +	lh->lh_flags = cpu_to_be32(flags); +	lh->lh_tail = cpu_to_be32(tail); +	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); +	hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); +	lh->lh_hash = cpu_to_be32(hash); + +	bh->b_end_io = end_buffer_write_sync; +	get_bh(bh); +	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { +		gfs2_ordered_wait(sdp); +		log_flush_wait(sdp); +		submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); +	} else { +		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); +	} +	wait_on_buffer(bh); + +	if (!buffer_uptodate(bh)) +		gfs2_io_error_bh(sdp, bh); +	brelse(bh); + +	if (sdp->sd_log_tail != tail) +		log_pull_tail(sdp, tail); +	else +		gfs2_assert_withdraw(sdp, !pull); + +	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); +	gfs2_log_incr_head(sdp); +} + +/**   * gfs2_log_flush - flush incore transaction(s)   * @sdp: the filesystem   * @gl: The glock structure to flush.  
If NULL, flush the whole incore log @@ -753,11 +678,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)  	gfs2_ordered_write(sdp);  	lops_before_commit(sdp); -	gfs2_ordered_wait(sdp); -	if (sdp->sd_log_head != sdp->sd_log_flush_head) -		log_flush_commit(sdp); -	else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ +	if (sdp->sd_log_head != sdp->sd_log_flush_head) { +		log_write_header(sdp, 0, 0); +	} else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){  		gfs2_log_lock(sdp);  		atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */  		trace_gfs2_log_blocks(sdp, -1); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index ab0621698b7..ff07454b582 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -53,10 +53,7 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,  extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);  extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); - -extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); -extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, -				      struct buffer_head *real); +extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn);  extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);  extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);  extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 0301be655b1..6b1efb594d9 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -12,6 +12,7 @@  #include <linux/spinlock.h>  #include <linux/completion.h>  #include <linux/buffer_head.h> +#include <linux/mempool.h>  #include <linux/gfs2_ondisk.h>  #include <linux/bio.h>  #include <linux/fs.h> @@ -76,7 +77,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)  	if (bi->bi_clone == 0)  		return;  	if (sdp->sd_args.ar_discard) -		gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); +		
gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);  	memcpy(bi->bi_clone + bi->bi_offset,  	       bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);  	clear_bit(GBF_FULL, &bi->bi_flags); @@ -143,6 +144,98 @@ static inline __be64 *bh_ptr_end(struct buffer_head *bh)  	return (__force __be64 *)(bh->b_data + bh->b_size);  } +/** + * gfs2_log_write_endio - End of I/O for a log buffer + * @bh: The buffer head + * @uptodate: I/O Status + * + */ + +static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) +{ +	struct gfs2_sbd *sdp = bh->b_private; +	bh->b_private = NULL; + +	end_buffer_write_sync(bh, uptodate); +	if (atomic_dec_and_test(&sdp->sd_log_in_flight)) +		wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_get_buf - Get and initialize a buffer to use for log control data + * @sdp: The GFS2 superblock + * + * tReturns: the buffer_head + */ + +static struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +{ +	u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); +	struct buffer_head *bh; + +	bh = sb_getblk(sdp->sd_vfs, blkno); +	lock_buffer(bh); +	memset(bh->b_data, 0, bh->b_size); +	set_buffer_uptodate(bh); +	clear_buffer_dirty(bh); +	gfs2_log_incr_head(sdp); +	atomic_inc(&sdp->sd_log_in_flight); +	bh->b_private = sdp; +	bh->b_end_io = gfs2_log_write_endio; + +	return bh; +} + +/** + * gfs2_fake_write_endio -  + * @bh: The buffer head + * @uptodate: The I/O Status + * + */ + +static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) +{ +	struct buffer_head *real_bh = bh->b_private; +	struct gfs2_bufdata *bd = real_bh->b_private; +	struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; + +	end_buffer_write_sync(bh, uptodate); +	mempool_free(bh, gfs2_bh_pool); +	unlock_buffer(real_bh); +	brelse(real_bh); +	if (atomic_dec_and_test(&sdp->sd_log_in_flight)) +		wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log + * @sdp: the filesystem + * @data: the data 
the buffer_head should point to + * + * Returns: the log buffer descriptor + */ + +static struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, +				      struct buffer_head *real) +{ +	u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); +	struct buffer_head *bh; + +	bh = mempool_alloc(gfs2_bh_pool, GFP_NOFS); +	atomic_set(&bh->b_count, 1); +	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); +	set_bh_page(bh, real->b_page, bh_offset(real)); +	bh->b_blocknr = blkno; +	bh->b_size = sdp->sd_sb.sb_bsize; +	bh->b_bdev = sdp->sd_vfs->s_bdev; +	bh->b_private = real; +	bh->b_end_io = gfs2_fake_write_endio; + +	gfs2_log_incr_head(sdp); +	atomic_inc(&sdp->sd_log_in_flight); + +	return bh; +}  static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)  { @@ -553,11 +646,11 @@ static void gfs2_check_magic(struct buffer_head *bh)  	__be32 *ptr;  	clear_buffer_escaped(bh); -	kaddr = kmap_atomic(bh->b_page, KM_USER0); +	kaddr = kmap_atomic(bh->b_page);  	ptr = kaddr + bh_offset(bh);  	if (*ptr == cpu_to_be32(GFS2_MAGIC))  		set_buffer_escaped(bh); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  }  static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -594,10 +687,10 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,  		if (buffer_escaped(bd->bd_bh)) {  			void *kaddr;  			bh1 = gfs2_log_get_buf(sdp); -			kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0); +			kaddr = kmap_atomic(bd->bd_bh->b_page);  			memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh),  			       bh1->b_size); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			*(__be32 *)bh1->b_data = 0;  			clear_buffer_escaped(bd->bd_bh);  			unlock_buffer(bd->bd_bh); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a8d9bcd0e19..754426b1e52 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -17,6 +17,7 @@  #include <linux/rcupdate.h>  #include <linux/rculist_bl.h>  #include <linux/atomic.h> +#include 
<linux/mempool.h>  #include "gfs2.h"  #include "incore.h" @@ -69,6 +70,16 @@ static void gfs2_init_gl_aspace_once(void *foo)  	address_space_init_once(mapping);  } +static void *gfs2_bh_alloc(gfp_t mask, void *data) +{ +	return alloc_buffer_head(mask); +} + +static void gfs2_bh_free(void *ptr, void *data) +{ +	return free_buffer_head(ptr); +} +  /**   * init_gfs2_fs - Register GFS2 as a filesystem   * @@ -151,6 +162,10 @@ static int __init init_gfs2_fs(void)  	gfs2_control_wq = alloc_workqueue("gfs2_control",  			       WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);  	if (!gfs2_control_wq) +		goto fail_recovery; + +	gfs2_bh_pool = mempool_create(1024, gfs2_bh_alloc, gfs2_bh_free, NULL); +	if (!gfs2_bh_pool)  		goto fail_control;  	gfs2_register_debugfs(); @@ -160,6 +175,8 @@ static int __init init_gfs2_fs(void)  	return 0;  fail_control: +	destroy_workqueue(gfs2_control_wq); +fail_recovery:  	destroy_workqueue(gfs_recovery_wq);  fail_wq:  	unregister_filesystem(&gfs2meta_fs_type); @@ -208,6 +225,7 @@ static void __exit exit_gfs2_fs(void)  	rcu_barrier(); +	mempool_destroy(gfs2_bh_pool);  	kmem_cache_destroy(gfs2_quotad_cachep);  	kmem_cache_destroy(gfs2_rgrpd_cachep);  	kmem_cache_destroy(gfs2_bufdata_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6aacf3f230a..6f3a18f9e17 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -68,6 +68,12 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)  	sb->s_fs_info = sdp;  	sdp->sd_vfs = sb; +	sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats); +	if (!sdp->sd_lkstats) { +		kfree(sdp); +		return NULL; +	} +  	set_bit(SDF_NOJOURNALID, &sdp->sd_flags);  	gfs2_tune_init(&sdp->sd_tune); @@ -77,7 +83,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)  	spin_lock_init(&sdp->sd_statfs_spin);  	spin_lock_init(&sdp->sd_rindex_spin); -	mutex_init(&sdp->sd_rindex_mutex);  	sdp->sd_rindex_tree.rb_node = NULL;  	INIT_LIST_HEAD(&sdp->sd_jindex_list); @@ -431,10 +436,9 @@ static 
int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,  		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));  		return PTR_ERR(inode);  	} -	dentry = d_alloc_root(inode); +	dentry = d_make_root(inode);  	if (!dentry) {  		fs_err(sdp, "can't alloc %s dentry\n", name); -		iput(inode);  		return -ENOMEM;  	}  	*dptr = dentry; @@ -800,6 +804,11 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)  		fs_err(sdp, "can't get quota file inode: %d\n", error);  		goto fail_rindex;  	} + +	error = gfs2_rindex_update(sdp); +	if (error) +		goto fail_qinode; +  	return 0;  fail_qinode: @@ -1216,6 +1225,7 @@ fail_sys:  	gfs2_sys_fs_del(sdp);  fail:  	gfs2_delete_debugfs_file(sdp); +	free_percpu(sdp->sd_lkstats);  	kfree(sdp);  	sb->s_fs_info = NULL;  	return error; @@ -1388,6 +1398,7 @@ static void gfs2_kill_sb(struct super_block *sb)  	shrink_dcache_sb(sb);  	kill_block_super(sb);  	gfs2_delete_debugfs_file(sdp); +	free_percpu(sdp->sd_lkstats);  	kfree(sdp);  } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a45b21b0391..6019da3dcae 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -681,7 +681,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,  	ptr = qp;  	nbytes = sizeof(struct gfs2_quota);  get_a_page: -	page = grab_cache_page(mapping, index); +	page = find_or_create_page(mapping, index, GFP_NOFS);  	if (!page)  		return -ENOMEM; @@ -720,12 +720,12 @@ get_a_page:  	gfs2_trans_add_bh(ip->i_gl, bh, 0); -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)  		nbytes = PAGE_CACHE_SIZE - offset;  	memcpy(kaddr + offset, ptr, nbytes);  	flush_dcache_page(page); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	unlock_page(page);  	page_cache_release(page); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 981bfa32121..19bde40b486 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -327,23 +327,34 @@ static inline int 
rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)   * Returns: The resource group, or NULL if not found   */ -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)  { -	struct rb_node **newn; +	struct rb_node *n, *next;  	struct gfs2_rgrpd *cur; +	if (gfs2_rindex_update(sdp)) +		return NULL; +  	spin_lock(&sdp->sd_rindex_spin); -	newn = &sdp->sd_rindex_tree.rb_node; -	while (*newn) { -		cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); +	n = sdp->sd_rindex_tree.rb_node; +	while (n) { +		cur = rb_entry(n, struct gfs2_rgrpd, rd_node); +		next = NULL;  		if (blk < cur->rd_addr) -			newn = &((*newn)->rb_left); +			next = n->rb_left;  		else if (blk >= cur->rd_data0 + cur->rd_data) -			newn = &((*newn)->rb_right); -		else { +			next = n->rb_right; +		if (next == NULL) {  			spin_unlock(&sdp->sd_rindex_spin); +			if (exact) { +				if (blk < cur->rd_addr) +					return NULL; +				if (blk >= cur->rd_data0 + cur->rd_data) +					return NULL; +			}  			return cur;  		} +		n = next;  	}  	spin_unlock(&sdp->sd_rindex_spin); @@ -532,7 +543,6 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)  	struct file_ra_state ra_state;  	int error, rgrps; -	mutex_lock(&sdp->sd_rindex_mutex);  	file_ra_state_init(&ra_state, inode->i_mapping);  	for (rgrps = 0;; rgrps++) {  		loff_t pos = rgrps * sizeof(struct gfs2_rindex); @@ -545,11 +555,10 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)  			break;  		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);  	} -	mutex_unlock(&sdp->sd_rindex_mutex);  	return total_data;  } -static void rgd_insert(struct gfs2_rgrpd *rgd) +static int rgd_insert(struct gfs2_rgrpd *rgd)  {  	struct gfs2_sbd *sdp = rgd->rd_sbd;  	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; @@ -565,11 +574,13 @@ static void rgd_insert(struct gfs2_rgrpd *rgd)  		else if (rgd->rd_addr > cur->rd_addr)  			newn = &((*newn)->rb_right);  		else -			return; +			return 
-EEXIST;  	}  	rb_link_node(&rgd->rd_node, parent, newn);  	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); +	sdp->sd_rgrps++; +	return 0;  }  /** @@ -623,10 +634,12 @@ static int read_rindex_entry(struct gfs2_inode *ip,  	if (rgd->rd_data > sdp->sd_max_rg_data)  		sdp->sd_max_rg_data = rgd->rd_data;  	spin_lock(&sdp->sd_rindex_spin); -	rgd_insert(rgd); -	sdp->sd_rgrps++; +	error = rgd_insert(rgd);  	spin_unlock(&sdp->sd_rindex_spin); -	return error; +	if (!error) +		return 0; + +	error = 0; /* someone else read in the rgrp; free it and ignore it */  fail:  	kfree(rgd->rd_bits); @@ -683,20 +696,22 @@ int gfs2_rindex_update(struct gfs2_sbd *sdp)  	struct gfs2_glock *gl = ip->i_gl;  	struct gfs2_holder ri_gh;  	int error = 0; +	int unlock_required = 0;  	/* Read new copy from disk if we don't have the latest */  	if (!sdp->sd_rindex_uptodate) { -		mutex_lock(&sdp->sd_rindex_mutex); -		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); -		if (error) -			return error; +		if (!gfs2_glock_is_locked_by_me(gl)) { +			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); +			if (error) +				return error; +			unlock_required = 1; +		}  		if (!sdp->sd_rindex_uptodate)  			error = gfs2_ri_update(ip); -		gfs2_glock_dq_uninit(&ri_gh); -		mutex_unlock(&sdp->sd_rindex_mutex); +		if (unlock_required) +			gfs2_glock_dq_uninit(&ri_gh);  	} -  	return error;  } @@ -805,9 +820,9 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)  } -void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, +int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,  			     struct buffer_head *bh, -			     const struct gfs2_bitmap *bi) +			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)  {  	struct super_block *sb = sdp->sd_vfs;  	struct block_device *bdev = sb->s_bdev; @@ -818,11 +833,19 @@ void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,  	sector_t nr_sects = 0;  	int rv;  	unsigned int x; +	u32 trimmed = 0; +	u8 diff;  	for (x = 0; x < 
bi->bi_len; x++) { -		const u8 *orig = bh->b_data + bi->bi_offset + x; -		const u8 *clone = bi->bi_clone + bi->bi_offset + x; -		u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); +		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data; +		clone += bi->bi_offset; +		clone += x; +		if (bh) { +			const u8 *orig = bh->b_data + bi->bi_offset + x; +			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); +		} else { +			diff = ~(*clone | (*clone >> 1)); +		}  		diff &= 0x55;  		if (diff == 0)  			continue; @@ -833,11 +856,14 @@ void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,  				if (nr_sects == 0)  					goto start_new_extent;  				if ((start + nr_sects) != blk) { -					rv = blkdev_issue_discard(bdev, start, -							    nr_sects, GFP_NOFS, -							    0); -					if (rv) -						goto fail; +					if (nr_sects >= minlen) { +						rv = blkdev_issue_discard(bdev, +							start, nr_sects, +							GFP_NOFS, 0); +						if (rv) +							goto fail; +						trimmed += nr_sects; +					}  					nr_sects = 0;  start_new_extent:  					start = blk; @@ -848,15 +874,104 @@ start_new_extent:  			blk += sects_per_blk;  		}  	} -	if (nr_sects) { +	if (nr_sects >= minlen) {  		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);  		if (rv)  			goto fail; +		trimmed += nr_sects;  	} -	return; +	if (ptrimmed) +		*ptrimmed = trimmed; +	return 0; +  fail: -	fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); +	if (sdp->sd_args.ar_discard) +		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);  	sdp->sd_args.ar_discard = 0; +	return -EIO; +} + +/** + * gfs2_fitrim - Generate discard requests for unused bits of the filesystem + * @filp: Any file on the filesystem + * @argp: Pointer to the arguments (also used to pass result) + * + * Returns: 0 on success, otherwise error code + */ + +int gfs2_fitrim(struct file *filp, void __user *argp) +{ +	struct inode *inode = 
filp->f_dentry->d_inode; +	struct gfs2_sbd *sdp = GFS2_SB(inode); +	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); +	struct buffer_head *bh; +	struct gfs2_rgrpd *rgd; +	struct gfs2_rgrpd *rgd_end; +	struct gfs2_holder gh; +	struct fstrim_range r; +	int ret = 0; +	u64 amt; +	u64 trimmed = 0; +	unsigned int x; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	if (!blk_queue_discard(q)) +		return -EOPNOTSUPP; + +	if (argp == NULL) { +		r.start = 0; +		r.len = ULLONG_MAX; +		r.minlen = 0; +	} else if (copy_from_user(&r, argp, sizeof(r))) +		return -EFAULT; + +	rgd = gfs2_blk2rgrpd(sdp, r.start, 0); +	rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); + +	while (1) { + +		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); +		if (ret) +			goto out; + +		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) { +			/* Trim each bitmap in the rgrp */ +			for (x = 0; x < rgd->rd_length; x++) { +				struct gfs2_bitmap *bi = rgd->rd_bits + x; +				ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); +				if (ret) { +					gfs2_glock_dq_uninit(&gh); +					goto out; +				} +				trimmed += amt; +			} + +			/* Mark rgrp as having been trimmed */ +			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); +			if (ret == 0) { +				bh = rgd->rd_bits[0].bi_bh; +				rgd->rd_flags |= GFS2_RGF_TRIMMED; +				gfs2_trans_add_bh(rgd->rd_gl, bh, 1); +				gfs2_rgrp_out(rgd, bh->b_data); +				gfs2_trans_end(sdp); +			} +		} +		gfs2_glock_dq_uninit(&gh); + +		if (rgd == rgd_end) +			break; + +		rgd = gfs2_rgrpd_get_next(rgd); +	} + +out: +	r.len = trimmed << 9; +	if (argp && copy_to_user(argp, &r, sizeof(r))) +		return -EFAULT; + +	return ret;  }  /** @@ -1003,7 +1118,7 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)  	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))  		rgd = begin = ip->i_rgd;  	else -		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); +		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);  	if (rgd == NULL)  		return 
-EBADSLT; @@ -1288,7 +1403,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,  	u32 length, rgrp_blk, buf_blk;  	unsigned int buf; -	rgd = gfs2_blk2rgrpd(sdp, bstart); +	rgd = gfs2_blk2rgrpd(sdp, bstart, 1);  	if (!rgd) {  		if (gfs2_consist(sdp))  			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); @@ -1469,7 +1584,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)  		return;  	trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);  	rgd->rd_free += blen; - +	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;  	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1555,14 +1670,9 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)  {  	struct gfs2_rgrpd *rgd;  	struct gfs2_holder rgd_gh; -	int error; - -	error = gfs2_rindex_update(sdp); -	if (error) -		return error; +	int error = -EINVAL; -	error = -EINVAL; -	rgd = gfs2_blk2rgrpd(sdp, no_addr); +	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);  	if (!rgd)  		goto fail; @@ -1605,7 +1715,7 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,  	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))  		rgd = ip->i_rgd;  	else -		rgd = gfs2_blk2rgrpd(sdp, block); +		rgd = gfs2_blk2rgrpd(sdp, block, 1);  	if (!rgd) {  		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);  		return; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index ceec9106cdf..b4b10f4de25 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -11,6 +11,7 @@  #define __RGRP_DOT_H__  #include <linux/slab.h> +#include <linux/uaccess.h>  struct gfs2_rgrpd;  struct gfs2_sbd; @@ -18,7 +19,7 @@ struct gfs2_holder;  extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact);  extern struct gfs2_rgrpd 
*gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);  extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); @@ -62,8 +63,9 @@ extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);  extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);  extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);  extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); -extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, -				    struct buffer_head *bh, -				    const struct gfs2_bitmap *bi); +extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, +				   struct buffer_head *bh, +				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); +extern int gfs2_fitrim(struct file *filp, void __user *argp);  #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4553ce515f6..6172fa77ad5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1417,7 +1417,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)  	if (error)  		goto out; -	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); +	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);  	if (!rgd) {  		gfs2_consist_inode(ip);  		error = -EIO; @@ -1557,6 +1557,7 @@ out:  	end_writeback(inode);  	gfs2_dir_hash_inval(ip);  	ip->i_gl->gl_object = NULL; +	flush_delayed_work_sync(&ip->i_gl->gl_work);  	gfs2_glock_add_to_lru(ip->i_gl);  	gfs2_glock_put(ip->i_gl);  	ip->i_gl = NULL; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 5d07609ec57..dfa89cd7553 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -11,6 +11,7 @@  #include <linux/dlmconstants.h>  #include <linux/gfs2_ondisk.h>  #include <linux/writeback.h> +#include <linux/ktime.h>  #include "incore.h"  #include "glock.h" @@ -43,7 +44,8 @@  	{(1UL << GLF_FROZEN),			"F" },		\  	{(1UL << GLF_QUEUED),			"q" },		\  	{(1UL << GLF_LRU),			"L" },		\ -	{(1UL << GLF_OBJECT),			"o" }) +	{(1UL << GLF_OBJECT),			"o" },		\ +	{(1UL << GLF_BLOCKING),			"b" })  #ifndef 
NUMPTY  #define NUMPTY @@ -236,6 +238,62 @@ TRACE_EVENT(gfs2_glock_queue,  		  glock_trace_name(__entry->state))  ); +/* DLM sends a reply to GFS2 */ +TRACE_EVENT(gfs2_glock_lock_time, + +	TP_PROTO(const struct gfs2_glock *gl, s64 tdiff), + +	TP_ARGS(gl, tdiff), + +	TP_STRUCT__entry( +		__field(	dev_t,	dev		) +		__field(	u64,	glnum		) +		__field(	u32,	gltype		) +		__field(	int,	status		) +		__field(	char,	flags		) +		__field(	s64,	tdiff		) +		__field(	s64,	srtt		) +		__field(	s64,	srttvar		) +		__field(	s64,	srttb		) +		__field(	s64,	srttvarb	) +		__field(	s64,	sirt		) +		__field(	s64,	sirtvar		) +		__field(	s64,	dcount		) +		__field(	s64,	qcount		) +	), + +	TP_fast_assign( +		__entry->dev            = gl->gl_sbd->sd_vfs->s_dev; +		__entry->glnum          = gl->gl_name.ln_number; +		__entry->gltype         = gl->gl_name.ln_type; +		__entry->status		= gl->gl_lksb.sb_status; +		__entry->flags		= gl->gl_lksb.sb_flags; +		__entry->tdiff		= tdiff; +		__entry->srtt		= gl->gl_stats.stats[GFS2_LKS_SRTT]; +		__entry->srttvar	= gl->gl_stats.stats[GFS2_LKS_SRTTVAR]; +		__entry->srttb		= gl->gl_stats.stats[GFS2_LKS_SRTTB]; +		__entry->srttvarb	= gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; +		__entry->sirt		= gl->gl_stats.stats[GFS2_LKS_SIRT]; +		__entry->sirtvar	= gl->gl_stats.stats[GFS2_LKS_SIRTVAR]; +		__entry->dcount		= gl->gl_stats.stats[GFS2_LKS_DCOUNT]; +		__entry->qcount		= gl->gl_stats.stats[GFS2_LKS_QCOUNT]; +	), + +	TP_printk("%u,%u glock %d:%lld status:%d flags:%02x tdiff:%lld srtt:%lld/%lld srttb:%lld/%lld sirt:%lld/%lld dcnt:%lld qcnt:%lld", +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, +		  (unsigned long long)__entry->glnum, +		  __entry->status, __entry->flags, +		  (long long)__entry->tdiff, +		  (long long)__entry->srtt, +		  (long long)__entry->srttvar, +		  (long long)__entry->srttb, +		  (long long)__entry->srttvarb, +		  (long long)__entry->sirt, +		  (long long)__entry->sirtvar, +		  (long long)__entry->dcount, +		  (long 
long)__entry->qcount) +); +  /* Section 2 - Log/journal   *   * Objectives: diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 53511291fe3..9e7765e8e7b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -25,6 +25,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly;  struct kmem_cache *gfs2_bufdata_cachep __read_mostly;  struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;  struct kmem_cache *gfs2_quotad_cachep __read_mostly; +mempool_t *gfs2_bh_pool __read_mostly;  void gfs2_assert_i(struct gfs2_sbd *sdp)  { diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index b432e04600d..a4ce76c67db 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -10,6 +10,8 @@  #ifndef __UTIL_DOT_H__  #define __UTIL_DOT_H__ +#include <linux/mempool.h> +  #include "incore.h"  #define fs_printk(level, fs, fmt, arg...) \ @@ -150,6 +152,7 @@ extern struct kmem_cache *gfs2_inode_cachep;  extern struct kmem_cache *gfs2_bufdata_cachep;  extern struct kmem_cache *gfs2_rgrpd_cachep;  extern struct kmem_cache *gfs2_quotad_cachep; +extern mempool_t *gfs2_bh_pool;  static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,  					   unsigned int *p) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index e9636591b5d..2e5ba425cae 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -251,7 +251,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,  	if (!blks)  		return 0; -	rgd = gfs2_blk2rgrpd(sdp, bn); +	rgd = gfs2_blk2rgrpd(sdp, bn, 1);  	if (!rgd) {  		gfs2_consist_inode(ip);  		return -EIO; @@ -1439,7 +1439,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)  	struct gfs2_holder gh;  	int error; -	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); +	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1);  	if (!rgd) {  		gfs2_consist_inode(ip);  		return -EIO; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 8137fb3e678..7b4c537d6e1 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -430,15 +430,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)  	
sb->s_d_op = &hfs_dentry_operations;  	res = -ENOMEM; -	sb->s_root = d_alloc_root(root_inode); +	sb->s_root = d_make_root(root_inode);  	if (!sb->s_root) -		goto bail_iput; +		goto bail_no_root;  	/* everything's okay */  	return 0; -bail_iput: -	iput(root_inode);  bail_no_root:  	printk(KERN_ERR "hfs: get root inode failed.\n");  bail: diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 21a5b7fc6db..4e75ac646fe 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -317,6 +317,11 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)  /* + * hfs+-specific ioctl for making the filesystem bootable + */ +#define HFSPLUS_IOC_BLESS _IO('h', 0x80) + +/*   * Functions in any *.c used in other files   */ diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 927cdd6d5bf..921967e5abb 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -117,7 +117,7 @@ struct hfsplus_vh {  	__be32 write_count;  	__be64 encodings_bmp; -	u8 finder_info[32]; +	u32 finder_info[8];  	struct hfsplus_fork_raw alloc_file;  	struct hfsplus_fork_raw ext_file; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 6643b242bdd..82b69ee4dac 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -193,6 +193,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir,  	mutex_init(&hip->extents_lock);  	hip->extent_state = 0;  	hip->flags = 0; +	hip->userflags = 0;  	set_bit(HFSPLUS_I_RSRC, &hip->flags);  	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); @@ -400,6 +401,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)  	atomic_set(&hip->opencnt, 0);  	hip->extent_state = 0;  	hip->flags = 0; +	hip->userflags = 0;  	memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));  	memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));  	hip->alloc_blocks = 0; diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index f66c7655b3f..c640ba57074 100644 --- a/fs/hfsplus/ioctl.c +++ 
b/fs/hfsplus/ioctl.c @@ -20,6 +20,38 @@  #include <asm/uaccess.h>  #include "hfsplus_fs.h" +/* + * "Blessing" an HFS+ filesystem writes metadata to the superblock informing + * the platform firmware which file to boot from + */ +static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) +{ +	struct dentry *dentry = file->f_path.dentry; +	struct inode *inode = dentry->d_inode; +	struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); +	struct hfsplus_vh *vh = sbi->s_vhdr; +	struct hfsplus_vh *bvh = sbi->s_backup_vhdr; + +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; + +	mutex_lock(&sbi->vh_mutex); + +	/* Directory containing the bootable system */ +	vh->finder_info[0] = bvh->finder_info[0] = +		cpu_to_be32(parent_ino(dentry)); + +	/* Bootloader */ +	vh->finder_info[1] = bvh->finder_info[1] = cpu_to_be32(inode->i_ino); + +	/* Per spec, the OS X system folder - same as finder_info[0] here */ +	vh->finder_info[5] = bvh->finder_info[5] = +		cpu_to_be32(parent_ino(dentry)); + +	mutex_unlock(&sbi->vh_mutex); +	return 0; +} +  static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)  {  	struct inode *inode = file->f_path.dentry->d_inode; @@ -108,6 +140,8 @@ long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  		return hfsplus_ioctl_getflags(file, argp);  	case HFSPLUS_IOC_EXT2_SETFLAGS:  		return hfsplus_ioctl_setflags(file, argp); +	case HFSPLUS_IOC_BLESS: +		return hfsplus_ioctl_bless(file, argp);  	default:  		return -ENOTTY;  	} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 427682ca9e4..ceb1c281eef 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -465,6 +465,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)  		goto out_put_alloc_file;  	} +	sb->s_d_op = &hfsplus_dentry_operations; +	sb->s_root = d_make_root(root); +	if (!sb->s_root) { +		err = -ENOMEM; +		goto out_put_alloc_file; +	} +  	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;  	str.name = 
HFSP_HIDDENDIR_NAME;  	err = hfs_find_init(sbi->cat_tree, &fd); @@ -515,13 +522,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)  		}  	} -	sb->s_d_op = &hfsplus_dentry_operations; -	sb->s_root = d_alloc_root(root); -	if (!sb->s_root) { -		err = -ENOMEM; -		goto out_put_hidden_dir; -	} -  	unload_nls(sbi->nls);  	sbi->nls = nls;  	return 0; @@ -529,7 +529,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)  out_put_hidden_dir:  	iput(sbi->hidden_dir);  out_put_root: -	iput(root); +	dput(sb->s_root); +	sb->s_root = NULL;  out_put_alloc_file:  	iput(sbi->alloc_file);  out_close_cat_tree: diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 3cbfa93cd78..1fe731337f0 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -67,7 +67,8 @@ extern int access_file(char *path, int r, int w, int x);  extern int open_file(char *path, int r, int w, int append);  extern void *open_dir(char *path, int *err_out);  extern char *read_dir(void *stream, unsigned long long *pos, -		      unsigned long long *ino_out, int *len_out); +		      unsigned long long *ino_out, int *len_out, +		      unsigned int *type_out);  extern void close_file(void *stream);  extern int replace_file(int oldfd, int fd);  extern void close_dir(void *stream); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e130bd46d67..07c516bfea7 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -283,6 +283,7 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)  	char *name;  	unsigned long long next, ino;  	int error, len; +	unsigned int type;  	name = dentry_name(file->f_path.dentry);  	if (name == NULL) @@ -292,9 +293,9 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)  	if (dir == NULL)  		return -error;  	next = file->f_pos; -	while ((name = read_dir(dir, &next, &ino, &len)) != NULL) { +	while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {  		error = 
(*filldir)(ent, name, len, file->f_pos, -				   ino, DT_UNKNOWN); +				   ino, type);  		if (error) break;  		file->f_pos = next;  	} @@ -966,9 +967,9 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)  	}  	err = -ENOMEM; -	sb->s_root = d_alloc_root(root_inode); +	sb->s_root = d_make_root(root_inode);  	if (sb->s_root == NULL) -		goto out_put; +		goto out;  	return 0; diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index dd7bc38a382..a74ad0d371c 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -98,7 +98,8 @@ void *open_dir(char *path, int *err_out)  }  char *read_dir(void *stream, unsigned long long *pos, -	       unsigned long long *ino_out, int *len_out) +	       unsigned long long *ino_out, int *len_out, +	       unsigned int *type_out)  {  	DIR *dir = stream;  	struct dirent *ent; @@ -109,6 +110,7 @@ char *read_dir(void *stream, unsigned long long *pos,  		return NULL;  	*len_out = strlen(ent->d_name);  	*ino_out = ent->d_ino; +	*type_out = ent->d_type;  	*pos = telldir(dir);  	return ent->d_name;  } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 3690467c944..54f6eccb79d 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -625,11 +625,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)  	hpfs_init_inode(root);  	hpfs_read_inode(root);  	unlock_new_inode(root); -	s->s_root = d_alloc_root(root); -	if (!s->s_root) { -		iput(root); +	s->s_root = d_make_root(root); +	if (!s->s_root)  		goto bail0; -	}  	/*  	 * find the root directory's . 
pointer & finish filling in the inode diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index d92f4ce8092..a80e45a690a 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -726,17 +726,12 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)  	err = -ENOMEM;  	root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); -	if (!root_inode) -		goto out_mntput; - -	sb->s_root = d_alloc_root(root_inode); +	sb->s_root = d_make_root(root_inode);  	if (!sb->s_root) -		goto out_iput; +		goto out_mntput;  	return 0; - out_iput: -	iput(root_inode);   out_mntput:  	mntput(proc_mnt);   out: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e85a7ac021..ea251749d9d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -41,6 +41,25 @@ const struct file_operations hugetlbfs_file_operations;  static const struct inode_operations hugetlbfs_dir_inode_operations;  static const struct inode_operations hugetlbfs_inode_operations; +struct hugetlbfs_config { +	uid_t   uid; +	gid_t   gid; +	umode_t mode; +	long	nr_blocks; +	long	nr_inodes; +	struct hstate *hstate; +}; + +struct hugetlbfs_inode_info { +	struct shared_policy policy; +	struct inode vfs_inode; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ +	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} +  static struct backing_dev_info hugetlbfs_backing_dev_info = {  	.name		= "hugetlbfs",  	.ra_pages	= 0,	/* No readahead */ @@ -154,10 +173,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,  			return addr;  	} -	start_addr = mm->free_area_cache; - -	if (len <= mm->cached_hole_size) +	if (len > mm->cached_hole_size) +		start_addr = mm->free_area_cache; +	else {  		start_addr = TASK_UNMAPPED_BASE; +		mm->cached_hole_size = 0; +	}  full_search:  	addr = ALIGN(start_addr, huge_page_size(h)); @@ -171,13 +192,18 @@ full_search:  			 */  			if (start_addr != TASK_UNMAPPED_BASE) {  				start_addr = TASK_UNMAPPED_BASE; +				
mm->cached_hole_size = 0;  				goto full_search;  			}  			return -ENOMEM;  		} -		if (!vma || addr + len <= vma->vm_start) +		if (!vma || addr + len <= vma->vm_start) { +			mm->free_area_cache = addr + len;  			return addr; +		} +		if (addr + mm->cached_hole_size < vma->vm_start) +			mm->cached_hole_size = vma->vm_start - addr;  		addr = ALIGN(vma->vm_end, huge_page_size(h));  	}  } @@ -238,17 +264,10 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,  	loff_t isize;  	ssize_t retval = 0; -	mutex_lock(&inode->i_mutex); -  	/* validate length */  	if (len == 0)  		goto out; -	isize = i_size_read(inode); -	if (!isize) -		goto out; - -	end_index = (isize - 1) >> huge_page_shift(h);  	for (;;) {  		struct page *page;  		unsigned long nr, ret; @@ -256,18 +275,21 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,  		/* nr is the maximum number of bytes to copy from this page */  		nr = huge_page_size(h); +		isize = i_size_read(inode); +		if (!isize) +			goto out; +		end_index = (isize - 1) >> huge_page_shift(h);  		if (index >= end_index) {  			if (index > end_index)  				goto out;  			nr = ((isize - 1) & ~huge_page_mask(h)) + 1; -			if (nr <= offset) { +			if (nr <= offset)  				goto out; -			}  		}  		nr = nr - offset;  		/* Find the page */ -		page = find_get_page(mapping, index); +		page = find_lock_page(mapping, index);  		if (unlikely(page == NULL)) {  			/*  			 * We have a HOLE, zero out the user-buffer for the @@ -279,17 +301,18 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,  			else  				ra = 0;  		} else { +			unlock_page(page); +  			/*  			 * We have the page, copy it to user space buffer.  			 
*/  			ra = hugetlbfs_read_actor(page, offset, buf, len, nr);  			ret = ra; +			page_cache_release(page);  		}  		if (ra < 0) {  			if (retval == 0)  				retval = ra; -			if (page) -				page_cache_release(page);  			goto out;  		} @@ -299,16 +322,12 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,  		index += offset >> huge_page_shift(h);  		offset &= ~huge_page_mask(h); -		if (page) -			page_cache_release(page); -  		/* short read or no more work */  		if ((ret != nr) || (len == 0))  			break;  	}  out:  	*ppos = ((loff_t)index << huge_page_shift(h)) + offset; -	mutex_unlock(&inode->i_mutex);  	return retval;  } @@ -607,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)  		spin_lock(&sbinfo->stat_lock);  		/* If no limits set, just report 0 for max/free/used  		 * blocks, like simple_statfs() */ -		if (sbinfo->max_blocks >= 0) { -			buf->f_blocks = sbinfo->max_blocks; -			buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; +		if (sbinfo->spool) { +			long free_pages; + +			spin_lock(&sbinfo->spool->lock); +			buf->f_blocks = sbinfo->spool->max_hpages; +			free_pages = sbinfo->spool->max_hpages +				- sbinfo->spool->used_hpages; +			buf->f_bavail = buf->f_bfree = free_pages; +			spin_unlock(&sbinfo->spool->lock);  			buf->f_files = sbinfo->max_inodes;  			buf->f_ffree = sbinfo->free_inodes;  		} @@ -625,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)  	if (sbi) {  		sb->s_fs_info = NULL; + +		if (sbi->spool) +			hugepage_put_subpool(sbi->spool); +  		kfree(sbi);  	}  } @@ -831,8 +860,6 @@ bad_val:  static int  hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)  { -	struct inode * inode; -	struct dentry * root;  	int ret;  	struct hugetlbfs_config config;  	struct hugetlbfs_sb_info *sbinfo; @@ -855,60 +882,31 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_fs_info = sbinfo;  	sbinfo->hstate = config.hstate;  	
spin_lock_init(&sbinfo->stat_lock); -	sbinfo->max_blocks = config.nr_blocks; -	sbinfo->free_blocks = config.nr_blocks;  	sbinfo->max_inodes = config.nr_inodes;  	sbinfo->free_inodes = config.nr_inodes; +	sbinfo->spool = NULL; +	if (config.nr_blocks != -1) { +		sbinfo->spool = hugepage_new_subpool(config.nr_blocks); +		if (!sbinfo->spool) +			goto out_free; +	}  	sb->s_maxbytes = MAX_LFS_FILESIZE;  	sb->s_blocksize = huge_page_size(config.hstate);  	sb->s_blocksize_bits = huge_page_shift(config.hstate);  	sb->s_magic = HUGETLBFS_MAGIC;  	sb->s_op = &hugetlbfs_ops;  	sb->s_time_gran = 1; -	inode = hugetlbfs_get_root(sb, &config); -	if (!inode) -		goto out_free; - -	root = d_alloc_root(inode); -	if (!root) { -		iput(inode); +	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); +	if (!sb->s_root)  		goto out_free; -	} -	sb->s_root = root;  	return 0;  out_free: +	if (sbinfo->spool) +		kfree(sbinfo->spool);  	kfree(sbinfo);  	return -ENOMEM;  } -int hugetlb_get_quota(struct address_space *mapping, long delta) -{ -	int ret = 0; -	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); - -	if (sbinfo->free_blocks > -1) { -		spin_lock(&sbinfo->stat_lock); -		if (sbinfo->free_blocks - delta >= 0) -			sbinfo->free_blocks -= delta; -		else -			ret = -ENOMEM; -		spin_unlock(&sbinfo->stat_lock); -	} - -	return ret; -} - -void hugetlb_put_quota(struct address_space *mapping, long delta) -{ -	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); - -	if (sbinfo->free_blocks > -1) { -		spin_lock(&sbinfo->stat_lock); -		sbinfo->free_blocks += delta; -		spin_unlock(&sbinfo->stat_lock); -	} -} -  static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,  	int flags, const char *dev_name, void *data)  { @@ -928,8 +926,8 @@ static int can_do_hugetlb_shm(void)  	return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);  } -struct file *hugetlb_file_setup(const char *name, size_t size, -				vm_flags_t acctflag, +struct file 
*hugetlb_file_setup(const char *name, unsigned long addr, +				size_t size, vm_flags_t acctflag,  				struct user_struct **user, int creat_flags)  {  	int error = -ENOMEM; @@ -938,6 +936,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,  	struct path path;  	struct dentry *root;  	struct qstr quick_string; +	struct hstate *hstate; +	unsigned long num_pages;  	*user = NULL;  	if (!hugetlbfs_vfsmount) @@ -946,7 +946,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size,  	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {  		*user = current_user();  		if (user_shm_lock(size, *user)) { -			printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); +			task_lock(current); +			printk_once(KERN_WARNING +				"%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", +				current->comm, current->pid); +			task_unlock(current);  		} else {  			*user = NULL;  			return ERR_PTR(-EPERM); @@ -967,10 +971,12 @@ struct file *hugetlb_file_setup(const char *name, size_t size,  	if (!inode)  		goto out_dentry; +	hstate = hstate_inode(inode); +	size += addr & ~huge_page_mask(hstate); +	num_pages = ALIGN(size, huge_page_size(hstate)) >> +			huge_page_shift(hstate);  	error = -ENOMEM; -	if (hugetlb_reserve_pages(inode, 0, -			size >> huge_page_shift(hstate_inode(inode)), NULL, -			acctflag)) +	if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))  		goto out_inode;  	d_instantiate(path.dentry, inode); @@ -1006,6 +1012,7 @@ static int __init init_hugetlbfs_fs(void)  	if (error)  		return error; +	error = -ENOMEM;  	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",  					sizeof(struct hugetlbfs_inode_info),  					0, 0, init_once); @@ -1024,10 +1031,10 @@ static int __init init_hugetlbfs_fs(void)  	}  	error = PTR_ERR(vfsmount); +	unregister_filesystem(&hugetlbfs_fs_type);   out: -	if (error) -		kmem_cache_destroy(hugetlbfs_inode_cachep); +	kmem_cache_destroy(hugetlbfs_inode_cachep);   
out2:  	bdi_destroy(&hugetlbfs_backing_dev_info);  	return error; diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad..9f4f5fecc09 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2,29 +2,19 @@   * (C) 1997 Linus Torvalds   * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)   */ +#include <linux/export.h>  #include <linux/fs.h>  #include <linux/mm.h> -#include <linux/dcache.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/writeback.h> -#include <linux/module.h>  #include <linux/backing-dev.h> -#include <linux/wait.h> -#include <linux/rwsem.h>  #include <linux/hash.h>  #include <linux/swap.h>  #include <linux/security.h> -#include <linux/pagemap.h>  #include <linux/cdev.h>  #include <linux/bootmem.h>  #include <linux/fsnotify.h>  #include <linux/mount.h> -#include <linux/async.h>  #include <linux/posix_acl.h>  #include <linux/prefetch.h> -#include <linux/ima.h> -#include <linux/cred.h>  #include <linux/buffer_head.h> /* for inode_has_buffers */  #include <linux/ratelimit.h>  #include "internal.h" @@ -938,8 +928,7 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)  		struct file_system_type *type = inode->i_sb->s_type;  		/* Set new key only if filesystem hasn't already changed it */ -		if (!lockdep_match_class(&inode->i_mutex, -		    &type->i_mutex_key)) { +		if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {  			/*  			 * ensure nobody is actually holding i_mutex  			 */ @@ -966,6 +955,7 @@ void unlock_new_inode(struct inode *inode)  	spin_lock(&inode->i_lock);  	WARN_ON(!(inode->i_state & I_NEW));  	inode->i_state &= ~I_NEW; +	smp_mb();  	wake_up_bit(&inode->i_state, __I_NEW);  	spin_unlock(&inode->i_lock);  } @@ -1369,17 +1359,6 @@ int generic_delete_inode(struct inode *inode)  EXPORT_SYMBOL(generic_delete_inode);  /* - * Normal UNIX filesystem behaviour: delete the - * inode when the usage count drops to zero, and - * i_nlink is zero. 
- */ -int generic_drop_inode(struct inode *inode) -{ -	return !inode->i_nlink || inode_unhashed(inode); -} -EXPORT_SYMBOL_GPL(generic_drop_inode); - -/*   * Called when we're dropping the last reference   * to an inode.   * @@ -1510,9 +1489,10 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,   *	This function automatically handles read only file systems and media,   *	as well as the "noatime" flag and inode specific "noatime" markers.   */ -void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +void touch_atime(struct path *path)  { -	struct inode *inode = dentry->d_inode; +	struct vfsmount *mnt = path->mnt; +	struct inode *inode = path->dentry->d_inode;  	struct timespec now;  	if (inode->i_flags & S_NOATIME) @@ -1651,7 +1631,7 @@ __setup("ihash_entries=", set_ihash_entries);   */  void __init inode_init_early(void)  { -	int loop; +	unsigned int loop;  	/* If hashes are distributed across NUMA nodes, defer  	 * hash allocation until vmalloc space is available. 
@@ -1669,13 +1649,13 @@ void __init inode_init_early(void)  					&i_hash_mask,  					0); -	for (loop = 0; loop < (1 << i_hash_shift); loop++) +	for (loop = 0; loop < (1U << i_hash_shift); loop++)  		INIT_HLIST_HEAD(&inode_hashtable[loop]);  }  void __init inode_init(void)  { -	int loop; +	unsigned int loop;  	/* inode slab cache */  	inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1679,7 @@ void __init inode_init(void)  					&i_hash_mask,  					0); -	for (loop = 0; loop < (1 << i_hash_shift); loop++) +	for (loop = 0; loop < (1U << i_hash_shift); loop++)  		INIT_HLIST_HEAD(&inode_hashtable[loop]);  } diff --git a/fs/ioctl.c b/fs/ioctl.c index 066836e8184..29167bebe87 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -10,7 +10,7 @@  #include <linux/file.h>  #include <linux/fs.h>  #include <linux/security.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/uaccess.h>  #include <linux/writeback.h>  #include <linux/buffer_head.h> diff --git a/fs/ioprio.c b/fs/ioprio.c index f84b380d65e..0f1b9515213 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)  	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);  	if (ioc) {  		ioc_ioprio_changed(ioc, ioprio); -		put_io_context(ioc, NULL); +		put_io_context(ioc);  	}  	return err; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bd62c76fb5d..29037c365ba 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -947,9 +947,8 @@ root_found:  	s->s_d_op = &isofs_dentry_ops[table];  	/* get the root dentry */ -	s->s_root = d_alloc_root(inode); +	s->s_root = d_make_root(inode);  	if (!(s->s_root)) { -		iput(inode);  		error = -ENOMEM;  		goto out_no_inode;  	} diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 5d1a00a5041..05f0754f2b4 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -453,8 +453,6 @@ out:   *   * Return <0 on error, 0 on success, 1 if there was nothing to clean up.   
* - * Called with the journal lock held. - *   * This is the only part of the journaling code which really needs to be   * aware of transaction aborts.  Checkpointing involves writing to the   * main filesystem area rather than to the journal, so it can proceed @@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)  	if (is_journal_aborted(journal))  		return 1; -	/* OK, work out the oldest transaction remaining in the log, and +	/* +	 * OK, work out the oldest transaction remaining in the log, and  	 * the log block it starts at.  	 *  	 * If the log is now empty, we need to work out which is the  	 * next transaction ID we will write, and where it will -	 * start. */ - +	 * start. +	 */  	spin_lock(&journal->j_state_lock);  	spin_lock(&journal->j_list_lock);  	transaction = journal->j_checkpoint_transactions; @@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)  		spin_unlock(&journal->j_state_lock);  		return 1;  	} +	spin_unlock(&journal->j_state_lock); + +	/* +	 * We need to make sure that any blocks that were recently written out +	 * --- perhaps by log_do_checkpoint() --- are flushed out before we +	 * drop the transactions from the journal. It's unlikely this will be +	 * necessary, especially with an appropriately sized journal, but we +	 * need this to guarantee correctness.  Fortunately +	 * cleanup_journal_tail() doesn't get called all that often. +	 */ +	if (journal->j_flags & JFS_BARRIER) +		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); +	spin_lock(&journal->j_state_lock); +	if (!tid_gt(first_tid, journal->j_tail_sequence)) { +		spin_unlock(&journal->j_state_lock); +		/* Someone else cleaned up journal so return 0 */ +		return 0; +	}  	/* OK, update the superblock to recover the freed space.  	 * Physical blocks come first: have we wrapped beyond the end of  	 * the log?  
*/ diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 59c09f9541b..0971e921780 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -129,6 +129,8 @@ static int kjournald(void *arg)  	setup_timer(&journal->j_commit_timer, commit_timeout,  			(unsigned long)current); +	set_freezable(); +  	/* Record that the journal thread is running */  	journal->j_task = current;  	wake_up(&journal->j_wait_done_commit); @@ -328,7 +330,7 @@ repeat:  		new_offset = offset_in_page(jh2bh(jh_in)->b_data);  	} -	mapped_data = kmap_atomic(new_page, KM_USER0); +	mapped_data = kmap_atomic(new_page);  	/*  	 * Check for escaping  	 */ @@ -337,7 +339,7 @@ repeat:  		need_copy_out = 1;  		do_escape = 1;  	} -	kunmap_atomic(mapped_data, KM_USER0); +	kunmap_atomic(mapped_data);  	/*  	 * Do we need to do a data copy? @@ -354,9 +356,9 @@ repeat:  		}  		jh_in->b_frozen_data = tmp; -		mapped_data = kmap_atomic(new_page, KM_USER0); +		mapped_data = kmap_atomic(new_page);  		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); -		kunmap_atomic(mapped_data, KM_USER0); +		kunmap_atomic(mapped_data);  		new_page = virt_to_page(tmp);  		new_offset = offset_in_page(tmp); @@ -368,9 +370,9 @@ repeat:  	 * copying, we can finally do so.  	 
*/  	if (do_escape) { -		mapped_data = kmap_atomic(new_page, KM_USER0); +		mapped_data = kmap_atomic(new_page);  		*((unsigned int *)(mapped_data + new_offset)) = 0; -		kunmap_atomic(mapped_data, KM_USER0); +		kunmap_atomic(mapped_data);  	}  	set_bh_page(new_bh, new_page, new_offset); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 5b43e96788e..008bf062fd2 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -20,6 +20,7 @@  #include <linux/fs.h>  #include <linux/jbd.h>  #include <linux/errno.h> +#include <linux/blkdev.h>  #endif  /* @@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)  	err2 = sync_blockdev(journal->j_fs_dev);  	if (!err)  		err = err2; +	/* Flush disk caches to get replayed data on the permanent storage */ +	if (journal->j_flags & JFS_BARRIER) +		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);  	return err;  } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 7fce94b04bc..b2a7e5244e3 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -718,9 +718,9 @@ done:  			    "Possible IO failure.\n");  		page = jh2bh(jh)->b_page;  		offset = offset_in_page(jh2bh(jh)->b_data); -		source = kmap_atomic(page, KM_USER0); +		source = kmap_atomic(page);  		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); -		kunmap_atomic(source, KM_USER0); +		kunmap_atomic(source);  	}  	jbd_unlock_bh_state(bh); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index d49d202903f..c78841ee81c 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh)   * whole transaction.   
*   * Requires j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it   */  static int __try_to_free_cp_buf(struct journal_head *jh)  {  	int ret = 0;  	struct buffer_head *bh = jh2bh(jh); -	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && +	if (jh->b_transaction == NULL && !buffer_locked(bh) &&  	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {  		/*  		 * Get our reference so that bh cannot be freed before @@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)  		get_bh(bh);  		JBUFFER_TRACE(jh, "remove from checkpoint list");  		ret = __jbd2_journal_remove_checkpoint(jh) + 1; -		jbd_unlock_bh_state(bh);  		BUFFER_TRACE(bh, "release");  		__brelse(bh); -	} else { -		jbd_unlock_bh_state(bh);  	}  	return ret;  } @@ -180,21 +176,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)  }  /* - * We were unable to perform jbd_trylock_bh_state() inside j_list_lock. - * The caller must restart a list walk.  Wait for someone else to run - * jbd_unlock_bh_state(). - */ -static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) -	__releases(journal->j_list_lock) -{ -	get_bh(bh); -	spin_unlock(&journal->j_list_lock); -	jbd_lock_bh_state(bh); -	jbd_unlock_bh_state(bh); -	put_bh(bh); -} - -/*   * Clean up transaction's list of buffers submitted for io.   * We wait for any pending IO to complete and remove any clean   * buffers. 
Note that we take the buffers in the opposite ordering @@ -222,15 +203,9 @@ restart:  	while (!released && transaction->t_checkpoint_io_list) {  		jh = transaction->t_checkpoint_io_list;  		bh = jh2bh(jh); -		if (!jbd_trylock_bh_state(bh)) { -			jbd_sync_bh(journal, bh); -			spin_lock(&journal->j_list_lock); -			goto restart; -		}  		get_bh(bh);  		if (buffer_locked(bh)) {  			spin_unlock(&journal->j_list_lock); -			jbd_unlock_bh_state(bh);  			wait_on_buffer(bh);  			/* the journal_head may have gone by now */  			BUFFER_TRACE(bh, "brelse"); @@ -246,7 +221,6 @@ restart:  		 * it has been written out and so we can drop it from the list  		 */  		released = __jbd2_journal_remove_checkpoint(jh); -		jbd_unlock_bh_state(bh);  		__brelse(bh);  	} @@ -266,7 +240,6 @@ __flush_batch(journal_t *journal, int *batch_count)  	for (i = 0; i < *batch_count; i++) {  		struct buffer_head *bh = journal->j_chkpt_bhs[i]; -		clear_buffer_jwrite(bh);  		BUFFER_TRACE(bh, "brelse");  		__brelse(bh);  	} @@ -281,7 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count)   * be written out.   
*   * Called with j_list_lock held and drops it if 1 is returned - * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it   */  static int __process_buffer(journal_t *journal, struct journal_head *jh,  			    int *batch_count, transaction_t *transaction) @@ -292,7 +264,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,  	if (buffer_locked(bh)) {  		get_bh(bh);  		spin_unlock(&journal->j_list_lock); -		jbd_unlock_bh_state(bh);  		wait_on_buffer(bh);  		/* the journal_head may have gone by now */  		BUFFER_TRACE(bh, "brelse"); @@ -304,7 +275,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,  		transaction->t_chp_stats.cs_forced_to_close++;  		spin_unlock(&journal->j_list_lock); -		jbd_unlock_bh_state(bh);  		if (unlikely(journal->j_flags & JBD2_UNMOUNT))  			/*  			 * The journal thread is dead; so starting and @@ -323,11 +293,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,  		if (unlikely(buffer_write_io_error(bh)))  			ret = -EIO;  		get_bh(bh); -		J_ASSERT_JH(jh, !buffer_jbddirty(bh));  		BUFFER_TRACE(bh, "remove from checkpoint");  		__jbd2_journal_remove_checkpoint(jh);  		spin_unlock(&journal->j_list_lock); -		jbd_unlock_bh_state(bh);  		__brelse(bh);  	} else {  		/* @@ -340,10 +308,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,  		BUFFER_TRACE(bh, "queue");  		get_bh(bh);  		J_ASSERT_BH(bh, !buffer_jwrite(bh)); -		set_buffer_jwrite(bh);  		journal->j_chkpt_bhs[*batch_count] = bh;  		__buffer_relink_io(jh); -		jbd_unlock_bh_state(bh);  		transaction->t_chp_stats.cs_written++;  		(*batch_count)++;  		if (*batch_count == JBD2_NR_BATCH) { @@ -407,15 +373,7 @@ restart:  		int retry = 0, err;  		while (!retry && transaction->t_checkpoint_list) { -			struct buffer_head *bh; -  			jh = transaction->t_checkpoint_list; -			bh = jh2bh(jh); -			if (!jbd_trylock_bh_state(bh)) { -				jbd_sync_bh(journal, bh); -				retry = 1; -				break; -			}  			retry = 
__process_buffer(journal, jh, &batch_count,  						 transaction);  			if (retry < 0 && !result) @@ -478,79 +436,28 @@ out:  int jbd2_cleanup_journal_tail(journal_t *journal)  { -	transaction_t * transaction;  	tid_t		first_tid; -	unsigned long	blocknr, freed; +	unsigned long	blocknr;  	if (is_journal_aborted(journal))  		return 1; -	/* OK, work out the oldest transaction remaining in the log, and -	 * the log block it starts at. -	 * -	 * If the log is now empty, we need to work out which is the -	 * next transaction ID we will write, and where it will -	 * start. */ - -	write_lock(&journal->j_state_lock); -	spin_lock(&journal->j_list_lock); -	transaction = journal->j_checkpoint_transactions; -	if (transaction) { -		first_tid = transaction->t_tid; -		blocknr = transaction->t_log_start; -	} else if ((transaction = journal->j_committing_transaction) != NULL) { -		first_tid = transaction->t_tid; -		blocknr = transaction->t_log_start; -	} else if ((transaction = journal->j_running_transaction) != NULL) { -		first_tid = transaction->t_tid; -		blocknr = journal->j_head; -	} else { -		first_tid = journal->j_transaction_sequence; -		blocknr = journal->j_head; -	} -	spin_unlock(&journal->j_list_lock); -	J_ASSERT(blocknr != 0); - -	/* If the oldest pinned transaction is at the tail of the log -           already then there's not much we can do right now. */ -	if (journal->j_tail_sequence == first_tid) { -		write_unlock(&journal->j_state_lock); +	if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))  		return 1; -	} - -	/* OK, update the superblock to recover the freed space. -	 * Physical blocks come first: have we wrapped beyond the end of -	 * the log?  
*/ -	freed = blocknr - journal->j_tail; -	if (blocknr < journal->j_tail) -		freed = freed + journal->j_last - journal->j_first; - -	trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); -	jbd_debug(1, -		  "Cleaning journal tail from %d to %d (offset %lu), " -		  "freeing %lu\n", -		  journal->j_tail_sequence, first_tid, blocknr, freed); - -	journal->j_free += freed; -	journal->j_tail_sequence = first_tid; -	journal->j_tail = blocknr; -	write_unlock(&journal->j_state_lock); +	J_ASSERT(blocknr != 0);  	/* -	 * If there is an external journal, we need to make sure that -	 * any data blocks that were recently written out --- perhaps -	 * by jbd2_log_do_checkpoint() --- are flushed out before we -	 * drop the transactions from the external journal.  It's -	 * unlikely this will be necessary, especially with a -	 * appropriately sized journal, but we need this to guarantee -	 * correctness.  Fortunately jbd2_cleanup_journal_tail() -	 * doesn't get called all that often. +	 * We need to make sure that any blocks that were recently written out +	 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before +	 * we drop the transactions from the journal. It's unlikely this will +	 * be necessary, especially with an appropriately sized journal, but we +	 * need this to guarantee correctness.  Fortunately +	 * jbd2_cleanup_journal_tail() doesn't get called all that often.  	 
*/ -	if ((journal->j_fs_dev != journal->j_dev) && -	    (journal->j_flags & JBD2_BARRIER)) +	if (journal->j_flags & JBD2_BARRIER)  		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); -	if (!(journal->j_flags & JBD2_ABORT)) -		jbd2_journal_update_superblock(journal, 1); + +	__jbd2_update_log_tail(journal, first_tid, blocknr);  	return 0;  } @@ -582,15 +489,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)  	do {  		jh = next_jh;  		next_jh = jh->b_cpnext; -		/* Use trylock because of the ranking */ -		if (jbd_trylock_bh_state(jh2bh(jh))) { -			ret = __try_to_free_cp_buf(jh); -			if (ret) { -				freed++; -				if (ret == 2) { -					*released = 1; -					return freed; -				} +		ret = __try_to_free_cp_buf(jh); +		if (ret) { +			freed++; +			if (ret == 2) { +				*released = 1; +				return freed;  			}  		}  		/* @@ -673,9 +577,7 @@ out:   * The function can free jh and bh.   *   * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh))   */ -  int __jbd2_journal_remove_checkpoint(struct journal_head *jh)  {  	struct transaction_chp_stats_s *stats; @@ -722,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)  				    transaction->t_tid, stats);  	__jbd2_journal_drop_transaction(journal, transaction); -	kfree(transaction); +	jbd2_journal_free_transaction(transaction);  	/* Just in case anybody was waiting for more transactions to be             checkpointed... 
*/ @@ -797,5 +699,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact  	J_ASSERT(journal->j_committing_transaction != transaction);  	J_ASSERT(journal->j_running_transaction != transaction); +	trace_jbd2_drop_transaction(journal, transaction); +  	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);  } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5069b847515..17f557f01cf 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -286,10 +286,10 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)  	char *addr;  	__u32 checksum; -	addr = kmap_atomic(page, KM_USER0); +	addr = kmap_atomic(page);  	checksum = crc32_be(crc32_sum,  		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size); -	kunmap_atomic(addr, KM_USER0); +	kunmap_atomic(addr);  	return checksum;  } @@ -331,6 +331,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)  	struct buffer_head *cbh = NULL; /* For transactional checksums */  	__u32 crc32_sum = ~0;  	struct blk_plug plug; +	/* Tail of the journal */ +	unsigned long first_block; +	tid_t first_tid; +	int update_tail;  	/*  	 * First job: lock down the current transaction and wait for @@ -340,7 +344,18 @@ void jbd2_journal_commit_transaction(journal_t *journal)  	/* Do we need to erase the effects of a prior jbd2_journal_flush? */  	if (journal->j_flags & JBD2_FLUSHED) {  		jbd_debug(3, "super block updated\n"); -		jbd2_journal_update_superblock(journal, 1); +		mutex_lock(&journal->j_checkpoint_mutex); +		/* +		 * We hold j_checkpoint_mutex so tail cannot change under us. +		 * We don't need any special data guarantees for writing sb +		 * since journal is empty and it is ok for write to be +		 * flushed only with transaction commit. 
+		 */ +		jbd2_journal_update_sb_log_tail(journal, +						journal->j_tail_sequence, +						journal->j_tail, +						WRITE_SYNC); +		mutex_unlock(&journal->j_checkpoint_mutex);  	} else {  		jbd_debug(3, "superblock not updated\n");  	} @@ -677,10 +692,30 @@ start_journal_io:  		err = 0;  	} +	/* +	 * Get current oldest transaction in the log before we issue flush +	 * to the filesystem device. After the flush we can be sure that +	 * blocks of all older transactions are checkpointed to persistent +	 * storage and we will be safe to update journal start in the +	 * superblock with the numbers we get here. +	 */ +	update_tail = +		jbd2_journal_get_log_tail(journal, &first_tid, &first_block); +  	write_lock(&journal->j_state_lock); +	if (update_tail) { +		long freed = first_block - journal->j_tail; + +		if (first_block < journal->j_tail) +			freed += journal->j_last - journal->j_first; +		/* Update tail only if we free significant amount of space */ +		if (freed < journal->j_maxlen / 4) +			update_tail = 0; +	}  	J_ASSERT(commit_transaction->t_state == T_COMMIT);  	commit_transaction->t_state = T_COMMIT_DFLUSH;  	write_unlock(&journal->j_state_lock); +  	/*   	 * If the journal is not located on the file system device,  	 * then we must flush the file system device before we issue @@ -831,6 +866,14 @@ wait_for_iobuf:  	if (err)  		jbd2_journal_abort(journal, err); +	/* +	 * Now disk caches for filesystem device are flushed so we are safe to +	 * erase checkpointed transactions from the log by updating journal +	 * superblock. +	 */ +	if (update_tail) +		jbd2_update_log_tail(journal, first_tid, first_block); +  	/* End of a transaction!  
Finally, we can do checkpoint             processing: any buffers committed as a result of this             transaction can be removed from any checkpoint list it was on @@ -1048,7 +1091,7 @@ restart_loop:  	jbd_debug(1, "JBD2: commit %d complete, head %d\n",  		  journal->j_commit_sequence, journal->j_tail_sequence);  	if (to_free) -		kfree(commit_transaction); +		jbd2_journal_free_transaction(commit_transaction);  	wake_up(&journal->j_wait_done_commit);  } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c0a5f9f1b12..98ed6dbfe38 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -71,7 +71,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke);  EXPORT_SYMBOL(jbd2_journal_init_dev);  EXPORT_SYMBOL(jbd2_journal_init_inode); -EXPORT_SYMBOL(jbd2_journal_update_format);  EXPORT_SYMBOL(jbd2_journal_check_used_features);  EXPORT_SYMBOL(jbd2_journal_check_available_features);  EXPORT_SYMBOL(jbd2_journal_set_features); @@ -96,7 +95,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);  EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);  EXPORT_SYMBOL(jbd2_inode_cache); -static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);  static void __journal_abort_soft (journal_t *journal, int errno);  static int jbd2_journal_create_slab(size_t slab_size); @@ -139,6 +137,8 @@ static int kjournald2(void *arg)  	setup_timer(&journal->j_commit_timer, commit_timeout,  			(unsigned long)current); +	set_freezable(); +  	/* Record that the journal thread is running */  	journal->j_task = current;  	wake_up(&journal->j_wait_done_commit); @@ -345,7 +345,7 @@ repeat:  		new_offset = offset_in_page(jh2bh(jh_in)->b_data);  	} -	mapped_data = kmap_atomic(new_page, KM_USER0); +	mapped_data = kmap_atomic(new_page);  	/*  	 * Fire data frozen trigger if data already wasn't frozen.  
Do this  	 * before checking for escaping, as the trigger may modify the magic @@ -364,7 +364,7 @@ repeat:  		need_copy_out = 1;  		do_escape = 1;  	} -	kunmap_atomic(mapped_data, KM_USER0); +	kunmap_atomic(mapped_data);  	/*  	 * Do we need to do a data copy? @@ -385,9 +385,9 @@ repeat:  		}  		jh_in->b_frozen_data = tmp; -		mapped_data = kmap_atomic(new_page, KM_USER0); +		mapped_data = kmap_atomic(new_page);  		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); -		kunmap_atomic(mapped_data, KM_USER0); +		kunmap_atomic(mapped_data);  		new_page = virt_to_page(tmp);  		new_offset = offset_in_page(tmp); @@ -406,9 +406,9 @@ repeat:  	 * copying, we can finally do so.  	 */  	if (do_escape) { -		mapped_data = kmap_atomic(new_page, KM_USER0); +		mapped_data = kmap_atomic(new_page);  		*((unsigned int *)(mapped_data + new_offset)) = 0; -		kunmap_atomic(mapped_data, KM_USER0); +		kunmap_atomic(mapped_data);  	}  	set_bh_page(new_bh, new_page, new_offset); @@ -744,6 +744,98 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)  	return jbd2_journal_add_journal_head(bh);  } +/* + * Return tid of the oldest transaction in the journal and block in the journal + * where the transaction starts. + * + * If the journal is now empty, return which will be the next transaction ID + * we will write and where will that transaction start. + * + * The return value is 0 if journal tail cannot be pushed any further, 1 if + * it can. 
+ */ +int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, +			      unsigned long *block) +{ +	transaction_t *transaction; +	int ret; + +	read_lock(&journal->j_state_lock); +	spin_lock(&journal->j_list_lock); +	transaction = journal->j_checkpoint_transactions; +	if (transaction) { +		*tid = transaction->t_tid; +		*block = transaction->t_log_start; +	} else if ((transaction = journal->j_committing_transaction) != NULL) { +		*tid = transaction->t_tid; +		*block = transaction->t_log_start; +	} else if ((transaction = journal->j_running_transaction) != NULL) { +		*tid = transaction->t_tid; +		*block = journal->j_head; +	} else { +		*tid = journal->j_transaction_sequence; +		*block = journal->j_head; +	} +	ret = tid_gt(*tid, journal->j_tail_sequence); +	spin_unlock(&journal->j_list_lock); +	read_unlock(&journal->j_state_lock); + +	return ret; +} + +/* + * Update information in journal structure and in on disk journal superblock + * about log tail. This function does not check whether information passed in + * really pushes log tail further. It's responsibility of the caller to make + * sure provided log tail information is valid (e.g. by holding + * j_checkpoint_mutex all the time between computing log tail and calling this + * function as is the case with jbd2_cleanup_journal_tail()). + * + * Requires j_checkpoint_mutex + */ +void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ +	unsigned long freed; + +	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + +	/* +	 * We cannot afford for write to remain in drive's caches since as +	 * soon as we update j_tail, next transaction can start reusing journal +	 * space and if we lose sb update during power failure we'd replay +	 * old transaction with possibly newly overwritten data. 
+	 */ +	jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); +	write_lock(&journal->j_state_lock); +	freed = block - journal->j_tail; +	if (block < journal->j_tail) +		freed += journal->j_last - journal->j_first; + +	trace_jbd2_update_log_tail(journal, tid, block, freed); +	jbd_debug(1, +		  "Cleaning journal tail from %d to %d (offset %lu), " +		  "freeing %lu\n", +		  journal->j_tail_sequence, tid, block, freed); + +	journal->j_free += freed; +	journal->j_tail_sequence = tid; +	journal->j_tail = block; +	write_unlock(&journal->j_state_lock); +} + +/* + * This is a variaon of __jbd2_update_log_tail which checks for validity of + * provided log tail and locks j_checkpoint_mutex. So it is safe against races + * with other threads updating log tail. + */ +void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +{ +	mutex_lock(&journal->j_checkpoint_mutex); +	if (tid_gt(tid, journal->j_tail_sequence)) +		__jbd2_update_log_tail(journal, tid, block); +	mutex_unlock(&journal->j_checkpoint_mutex); +} +  struct jbd2_stats_proc_session {  	journal_t *journal;  	struct transaction_stats_s *stats; @@ -1112,40 +1204,45 @@ static int journal_reset(journal_t *journal)  	journal->j_max_transaction_buffers = journal->j_maxlen / 4; -	/* Add the dynamic fields and write it to disk. */ -	jbd2_journal_update_superblock(journal, 1); -	return jbd2_journal_start_thread(journal); -} - -/** - * void jbd2_journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. 
- */ -void jbd2_journal_update_superblock(journal_t *journal, int wait) -{ -	journal_superblock_t *sb = journal->j_superblock; -	struct buffer_head *bh = journal->j_sb_buffer; -  	/*  	 * As a special case, if the on-disk copy is already marked as needing -	 * no recovery (s_start == 0) and there are no outstanding transactions -	 * in the filesystem, then we can safely defer the superblock update -	 * until the next commit by setting JBD2_FLUSHED.  This avoids +	 * no recovery (s_start == 0), then we can safely defer the superblock +	 * update until the next commit by setting JBD2_FLUSHED.  This avoids  	 * attempting a write to a potential-readonly device.  	 */ -	if (sb->s_start == 0 && journal->j_tail_sequence == -				journal->j_transaction_sequence) { +	if (sb->s_start == 0) {  		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "  			"(start %ld, seq %d, errno %d)\n",  			journal->j_tail, journal->j_tail_sequence,  			journal->j_errno); -		goto out; +		journal->j_flags |= JBD2_FLUSHED; +	} else { +		/* Lock here to make assertions happy... */ +		mutex_lock(&journal->j_checkpoint_mutex); +		/* +		 * Update log tail information. We use WRITE_FUA since new +		 * transaction will start reusing journal space and so we +		 * must make sure information about current log tail is on +		 * disk before that. +		 */ +		jbd2_journal_update_sb_log_tail(journal, +						journal->j_tail_sequence, +						journal->j_tail, +						WRITE_FUA); +		mutex_unlock(&journal->j_checkpoint_mutex);  	} +	return jbd2_journal_start_thread(journal); +} +static void jbd2_write_superblock(journal_t *journal, int write_op) +{ +	struct buffer_head *bh = journal->j_sb_buffer; +	int ret; + +	trace_jbd2_write_superblock(journal, write_op); +	if (!(journal->j_flags & JBD2_BARRIER)) +		write_op &= ~(REQ_FUA | REQ_FLUSH); +	lock_buffer(bh);  	if (buffer_write_io_error(bh)) {  		/*  		 * Oh, dear.  
A previous attempt to write the journal @@ -1161,48 +1258,106 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)  		clear_buffer_write_io_error(bh);  		set_buffer_uptodate(bh);  	} +	get_bh(bh); +	bh->b_end_io = end_buffer_write_sync; +	ret = submit_bh(write_op, bh); +	wait_on_buffer(bh); +	if (buffer_write_io_error(bh)) { +		clear_buffer_write_io_error(bh); +		set_buffer_uptodate(bh); +		ret = -EIO; +	} +	if (ret) { +		printk(KERN_ERR "JBD2: Error %d detected when updating " +		       "journal superblock for %s.\n", ret, +		       journal->j_devname); +	} +} +/** + * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, +				     unsigned long tail_block, int write_op) +{ +	journal_superblock_t *sb = journal->j_superblock; + +	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); +	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", +		  tail_block, tail_tid); + +	sb->s_sequence = cpu_to_be32(tail_tid); +	sb->s_start    = cpu_to_be32(tail_block); + +	jbd2_write_superblock(journal, write_op); + +	/* Log is no longer empty */ +	write_lock(&journal->j_state_lock); +	WARN_ON(!sb->s_sequence); +	journal->j_flags &= ~JBD2_FLUSHED; +	write_unlock(&journal->j_state_lock); +} + +/** + * jbd2_mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. 
+ */ +static void jbd2_mark_journal_empty(journal_t *journal) +{ +	journal_superblock_t *sb = journal->j_superblock; + +	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));  	read_lock(&journal->j_state_lock); -	jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n", -		  journal->j_tail, journal->j_tail_sequence, journal->j_errno); +	jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", +		  journal->j_tail_sequence);  	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); -	sb->s_start    = cpu_to_be32(journal->j_tail); -	sb->s_errno    = cpu_to_be32(journal->j_errno); +	sb->s_start    = cpu_to_be32(0);  	read_unlock(&journal->j_state_lock); -	BUFFER_TRACE(bh, "marking dirty"); -	mark_buffer_dirty(bh); -	if (wait) { -		sync_dirty_buffer(bh); -		if (buffer_write_io_error(bh)) { -			printk(KERN_ERR "JBD2: I/O error detected " -			       "when updating journal superblock for %s.\n", -			       journal->j_devname); -			clear_buffer_write_io_error(bh); -			set_buffer_uptodate(bh); -		} -	} else -		write_dirty_buffer(bh, WRITE); - -out: -	/* If we have just flushed the log (by marking s_start==0), then -	 * any future commit will have to be careful to update the -	 * superblock again to re-record the true start of the log. */ +	jbd2_write_superblock(journal, WRITE_FUA); +	/* Log is no longer empty */  	write_lock(&journal->j_state_lock); -	if (sb->s_start) -		journal->j_flags &= ~JBD2_FLUSHED; -	else -		journal->j_flags |= JBD2_FLUSHED; +	journal->j_flags |= JBD2_FLUSHED;  	write_unlock(&journal->j_state_lock);  } + +/** + * jbd2_journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno.  Write updated superblock to disk waiting for IO + * to complete. 
+ */ +static void jbd2_journal_update_sb_errno(journal_t *journal) +{ +	journal_superblock_t *sb = journal->j_superblock; + +	read_lock(&journal->j_state_lock); +	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", +		  journal->j_errno); +	sb->s_errno    = cpu_to_be32(journal->j_errno); +	read_unlock(&journal->j_state_lock); + +	jbd2_write_superblock(journal, WRITE_SYNC); +} +  /*   * Read the superblock for a given journal, performing initial   * validation of the format.   */ -  static int journal_get_superblock(journal_t *journal)  {  	struct buffer_head *bh; @@ -1396,14 +1551,11 @@ int jbd2_journal_destroy(journal_t *journal)  	if (journal->j_sb_buffer) {  		if (!is_journal_aborted(journal)) { -			/* We can now mark the journal as empty. */ -			journal->j_tail = 0; -			journal->j_tail_sequence = -				++journal->j_transaction_sequence; -			jbd2_journal_update_superblock(journal, 1); -		} else { +			mutex_lock(&journal->j_checkpoint_mutex); +			jbd2_mark_journal_empty(journal); +			mutex_unlock(&journal->j_checkpoint_mutex); +		} else  			err = -EIO; -		}  		brelse(journal->j_sb_buffer);  	} @@ -1550,61 +1702,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,  EXPORT_SYMBOL(jbd2_journal_clear_features);  /** - * int jbd2_journal_update_format () - Update on-disk journal structure. - * @journal: Journal to act on. - * - * Given an initialised but unloaded journal struct, poke about in the - * on-disk structure to update it to the most recent supported version. 
- */ -int jbd2_journal_update_format (journal_t *journal) -{ -	journal_superblock_t *sb; -	int err; - -	err = journal_get_superblock(journal); -	if (err) -		return err; - -	sb = journal->j_superblock; - -	switch (be32_to_cpu(sb->s_header.h_blocktype)) { -	case JBD2_SUPERBLOCK_V2: -		return 0; -	case JBD2_SUPERBLOCK_V1: -		return journal_convert_superblock_v1(journal, sb); -	default: -		break; -	} -	return -EINVAL; -} - -static int journal_convert_superblock_v1(journal_t *journal, -					 journal_superblock_t *sb) -{ -	int offset, blocksize; -	struct buffer_head *bh; - -	printk(KERN_WARNING -		"JBD2: Converting superblock from version 1 to 2.\n"); - -	/* Pre-initialise new fields to zero */ -	offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); -	blocksize = be32_to_cpu(sb->s_blocksize); -	memset(&sb->s_feature_compat, 0, blocksize-offset); - -	sb->s_nr_users = cpu_to_be32(1); -	sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); -	journal->j_format_version = 2; - -	bh = journal->j_sb_buffer; -	BUFFER_TRACE(bh, "marking dirty"); -	mark_buffer_dirty(bh); -	sync_dirty_buffer(bh); -	return 0; -} - - -/**   * int jbd2_journal_flush () - Flush journal   * @journal: Journal to act on.   * @@ -1617,7 +1714,6 @@ int jbd2_journal_flush(journal_t *journal)  {  	int err = 0;  	transaction_t *transaction = NULL; -	unsigned long old_tail;  	write_lock(&journal->j_state_lock); @@ -1652,6 +1748,7 @@ int jbd2_journal_flush(journal_t *journal)  	if (is_journal_aborted(journal))  		return -EIO; +	mutex_lock(&journal->j_checkpoint_mutex);  	jbd2_cleanup_journal_tail(journal);  	/* Finally, mark the journal as really needing no recovery. @@ -1659,14 +1756,9 @@ int jbd2_journal_flush(journal_t *journal)  	 * the magic code for a fully-recovered superblock.  Any future  	 * commits of data to the journal will restore the current  	 * s_start value. 
*/ +	jbd2_mark_journal_empty(journal); +	mutex_unlock(&journal->j_checkpoint_mutex);  	write_lock(&journal->j_state_lock); -	old_tail = journal->j_tail; -	journal->j_tail = 0; -	write_unlock(&journal->j_state_lock); -	jbd2_journal_update_superblock(journal, 1); -	write_lock(&journal->j_state_lock); -	journal->j_tail = old_tail; -  	J_ASSERT(!journal->j_running_transaction);  	J_ASSERT(!journal->j_committing_transaction);  	J_ASSERT(!journal->j_checkpoint_transactions); @@ -1706,8 +1798,12 @@ int jbd2_journal_wipe(journal_t *journal, int write)  		write ? "Clearing" : "Ignoring");  	err = jbd2_journal_skip_recovery(journal); -	if (write) -		jbd2_journal_update_superblock(journal, 1); +	if (write) { +		/* Lock to make assertions happy... */ +		mutex_lock(&journal->j_checkpoint_mutex); +		jbd2_mark_journal_empty(journal); +		mutex_unlock(&journal->j_checkpoint_mutex); +	}   no_recovery:  	return err; @@ -1757,7 +1853,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)  	__jbd2_journal_abort_hard(journal);  	if (errno) -		jbd2_journal_update_superblock(journal, 1); +		jbd2_journal_update_sb_errno(journal);  }  /** @@ -2015,7 +2111,7 @@ static struct kmem_cache *jbd2_journal_head_cache;  static atomic_t nr_journal_heads = ATOMIC_INIT(0);  #endif -static int journal_init_jbd2_journal_head_cache(void) +static int jbd2_journal_init_journal_head_cache(void)  {  	int retval; @@ -2033,7 +2129,7 @@ static int journal_init_jbd2_journal_head_cache(void)  	return retval;  } -static void jbd2_journal_destroy_jbd2_journal_head_cache(void) +static void jbd2_journal_destroy_journal_head_cache(void)  {  	if (jbd2_journal_head_cache) {  		kmem_cache_destroy(jbd2_journal_head_cache); @@ -2321,7 +2417,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)  struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; -static int __init journal_init_handle_cache(void) +static int __init jbd2_journal_init_handle_cache(void)  {  	jbd2_handle_cache = 
KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);  	if (jbd2_handle_cache == NULL) { @@ -2356,17 +2452,20 @@ static int __init journal_init_caches(void)  	ret = jbd2_journal_init_revoke_caches();  	if (ret == 0) -		ret = journal_init_jbd2_journal_head_cache(); +		ret = jbd2_journal_init_journal_head_cache(); +	if (ret == 0) +		ret = jbd2_journal_init_handle_cache();  	if (ret == 0) -		ret = journal_init_handle_cache(); +		ret = jbd2_journal_init_transaction_cache();  	return ret;  }  static void jbd2_journal_destroy_caches(void)  {  	jbd2_journal_destroy_revoke_caches(); -	jbd2_journal_destroy_jbd2_journal_head_cache(); +	jbd2_journal_destroy_journal_head_cache();  	jbd2_journal_destroy_handle_cache(); +	jbd2_journal_destroy_transaction_cache();  	jbd2_journal_destroy_slabs();  } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index da6d7baf139..c1a03354a22 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -21,6 +21,7 @@  #include <linux/jbd2.h>  #include <linux/errno.h>  #include <linux/crc32.h> +#include <linux/blkdev.h>  #endif  /* @@ -265,7 +266,9 @@ int jbd2_journal_recover(journal_t *journal)  	err2 = sync_blockdev(journal->j_fs_dev);  	if (!err)  		err = err2; - +	/* Make sure all replayed data is on permanent storage */ +	if (journal->j_flags & JBD2_BARRIER) +		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);  	return err;  } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 30b2867d6cc..6973705d6a3 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -208,17 +208,13 @@ int __init jbd2_journal_init_revoke_caches(void)  	J_ASSERT(!jbd2_revoke_record_cache);  	J_ASSERT(!jbd2_revoke_table_cache); -	jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", -					   sizeof(struct jbd2_revoke_record_s), -					   0, -					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, -					   NULL); +	jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, +					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);  	if (!jbd2_revoke_record_cache)  		goto 
record_cache_failure; -	jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", -					   sizeof(struct jbd2_revoke_table_s), -					   0, SLAB_TEMPORARY, NULL); +	jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, +					     SLAB_TEMPORARY);  	if (!jbd2_revoke_table_cache)  		goto table_cache_failure;  	return 0; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 35ae096bed5..ddcd3549c6c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -33,6 +33,35 @@  static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);  static void __jbd2_journal_unfile_buffer(struct journal_head *jh); +static struct kmem_cache *transaction_cache; +int __init jbd2_journal_init_transaction_cache(void) +{ +	J_ASSERT(!transaction_cache); +	transaction_cache = kmem_cache_create("jbd2_transaction_s", +					sizeof(transaction_t), +					0, +					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, +					NULL); +	if (transaction_cache) +		return 0; +	return -ENOMEM; +} + +void jbd2_journal_destroy_transaction_cache(void) +{ +	if (transaction_cache) { +		kmem_cache_destroy(transaction_cache); +		transaction_cache = NULL; +	} +} + +void jbd2_journal_free_transaction(transaction_t *transaction) +{ +	if (unlikely(ZERO_OR_NULL_PTR(transaction))) +		return; +	kmem_cache_free(transaction_cache, transaction); +} +  /*   * jbd2_get_transaction: obtain a new transaction_t object.   
* @@ -133,7 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,  alloc_transaction:  	if (!journal->j_running_transaction) { -		new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); +		new_transaction = kmem_cache_alloc(transaction_cache, +						   gfp_mask | __GFP_ZERO);  		if (!new_transaction) {  			/*  			 * If __GFP_FS is not present, then we may be @@ -162,7 +192,7 @@ repeat:  	if (is_journal_aborted(journal) ||  	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {  		read_unlock(&journal->j_state_lock); -		kfree(new_transaction); +		jbd2_journal_free_transaction(new_transaction);  		return -EROFS;  	} @@ -284,7 +314,7 @@ repeat:  	read_unlock(&journal->j_state_lock);  	lock_map_acquire(&handle->h_lockdep_map); -	kfree(new_transaction); +	jbd2_journal_free_transaction(new_transaction);  	return 0;  } @@ -783,12 +813,12 @@ done:  			    "Possible IO failure.\n");  		page = jh2bh(jh)->b_page;  		offset = offset_in_page(jh2bh(jh)->b_data); -		source = kmap_atomic(page, KM_USER0); +		source = kmap_atomic(page);  		/* Fire data frozen trigger just before we copy the data */  		jbd2_buffer_frozen_trigger(jh, source + offset,  					   jh->b_triggers);  		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); -		kunmap_atomic(source, KM_USER0); +		kunmap_atomic(source);  		/*  		 * Now that the frozen data is saved off, we need to store @@ -1549,9 +1579,9 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)   * of these pointers, it could go bad.  Generally the caller needs to re-read   * the pointer from the transaction_t.   * - * Called under j_list_lock.  The journal may not be locked. + * Called under j_list_lock.   
*/ -void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) +static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)  {  	struct journal_head **list = NULL;  	transaction_t *transaction; @@ -1646,10 +1676,8 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)  	spin_lock(&journal->j_list_lock);  	if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {  		/* written-back checkpointed metadata buffer */ -		if (jh->b_jlist == BJ_None) { -			JBUFFER_TRACE(jh, "remove from checkpoint list"); -			__jbd2_journal_remove_checkpoint(jh); -		} +		JBUFFER_TRACE(jh, "remove from checkpoint list"); +		__jbd2_journal_remove_checkpoint(jh);  	}  	spin_unlock(&journal->j_list_lock);  out: @@ -1949,6 +1977,8 @@ zap_buffer_unlocked:  	clear_buffer_mapped(bh);  	clear_buffer_req(bh);  	clear_buffer_new(bh); +	clear_buffer_delay(bh); +	clear_buffer_unwritten(bh);  	bh->b_bdev = NULL;  	return may_free;  } diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 5b6c9d1a2fb..96ed3c9ec3f 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -340,7 +340,7 @@ int jffs2_unregister_compressor(struct jffs2_compressor *comp)  	if (comp->usecount) {  		spin_unlock(&jffs2_compressor_list_lock); -		printk(KERN_WARNING "JFFS2: Compressor modul is in use. Unregister failed.\n"); +		printk(KERN_WARNING "JFFS2: Compressor module is in use. 
Unregister failed.\n");  		return -1;  	}  	list_del(&comp->list); diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index a01cdad6aad..eafb8d37a6f 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -335,7 +335,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl  	void *ebuf;  	uint32_t ofs;  	size_t retlen; -	int ret = -EIO; +	int ret;  	unsigned long *wordebuf;  	ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2e0123867cb..c0d5c9d770d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -561,9 +561,9 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)  	ret = -ENOMEM;  	D1(printk(KERN_DEBUG "jffs2_do_fill_super(): d_alloc_root()\n")); -	sb->s_root = d_alloc_root(root_i); +	sb->s_root = d_make_root(root_i);  	if (!sb->s_root) -		goto out_root_i; +		goto out_root;  	sb->s_maxbytes = 0xFFFFFFFF;  	sb->s_blocksize = PAGE_CACHE_SIZE; @@ -573,8 +573,6 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)  		jffs2_start_garbage_collect_thread(c);  	return 0; - out_root_i: -	iput(root_i);  out_root:  	jffs2_free_ino_caches(c);  	jffs2_free_raw_node_refs(c); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 5f7c160ea64..07c91ca6017 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -220,12 +220,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)  	dquot_initialize(dip); -	/* link count overflow on parent directory ? 
*/ -	if (dip->i_nlink == JFS_LINK_MAX) { -		rc = -EMLINK; -		goto out1; -	} -  	/*  	 * search parent directory for entry/freespace  	 * (dtSearch() returns parent directory page pinned) @@ -806,9 +800,6 @@ static int jfs_link(struct dentry *old_dentry,  	jfs_info("jfs_link: %s %s", old_dentry->d_name.name,  		 dentry->d_name.name); -	if (ip->i_nlink == JFS_LINK_MAX) -		return -EMLINK; -  	dquot_initialize(dir);  	tid = txBegin(ip->i_sb, 0); @@ -1138,10 +1129,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,  				rc = -ENOTEMPTY;  				goto out3;  			} -		} else if ((new_dir != old_dir) && -			   (new_dir->i_nlink == JFS_LINK_MAX)) { -			rc = -EMLINK; -			goto out3;  		}  	} else if (new_ip) {  		IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 682bca642f3..4a82950f412 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -441,6 +441,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)  		return -ENOMEM;  	sb->s_fs_info = sbi; +	sb->s_max_links = JFS_LINK_MAX;  	sbi->sb = sb;  	sbi->uid = sbi->gid = sbi->umask = -1; @@ -521,7 +522,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)  		ret = PTR_ERR(inode);  		goto out_no_rw;  	} -	sb->s_root = d_alloc_root(inode); +	sb->s_root = d_make_root(inode);  	if (!sb->s_root)  		goto out_no_root; @@ -539,7 +540,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)  out_no_root:  	jfs_err("jfs_read_super: get root dentry failed"); -	iput(inode);  out_no_rw:  	rc = jfs_umount(sb); @@ -860,8 +860,14 @@ static int __init init_jfs_fs(void)  	jfs_proc_init();  #endif -	return register_filesystem(&jfs_fs_type); +	rc = register_filesystem(&jfs_fs_type); +	if (!rc) +		return 0; +#ifdef PROC_FS_JFS +	jfs_proc_clean(); +#endif +	kthread_stop(jfsSyncThread);  kill_committask:  	for (i = 0; i < commit_threads; i++)  		kthread_stop(jfsCommitThread[i]); diff --git a/fs/libfs.c b/fs/libfs.c index 
5b2dbb3ba4f..4a0d1f06da5 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -3,7 +3,7 @@   *	Library for filesystems writers.   */ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/pagemap.h>  #include <linux/slab.h>  #include <linux/mount.h> @@ -491,11 +491,9 @@ int simple_fill_super(struct super_block *s, unsigned long magic,  	inode->i_op = &simple_dir_inode_operations;  	inode->i_fop = &simple_dir_operations;  	set_nlink(inode, 2); -	root = d_alloc_root(inode); -	if (!root) { -		iput(inode); +	root = d_make_root(inode); +	if (!root)  		return -ENOMEM; -	}  	for (i = 0; !files->name || files->name[0]; i++, files++) {  		if (!files->name)  			continue; @@ -536,7 +534,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c  	spin_lock(&pin_fs_lock);  	if (unlikely(!*mount)) {  		spin_unlock(&pin_fs_lock); -		mnt = vfs_kern_mount(type, 0, type->name, NULL); +		mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);  		if (IS_ERR(mnt))  			return PTR_ERR(mnt);  		spin_lock(&pin_fs_lock); diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index f848b52c67b..3ddcbb1c0a4 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -598,7 +598,7 @@ static struct rpc_procinfo	nlm4_procedures[] = {  	PROC(GRANTED_RES,	res,		norep),  }; -struct rpc_version	nlm_version4 = { +const struct rpc_version nlm_version4 = {  	.number		= 4,  	.nrprocs	= ARRAY_SIZE(nlm4_procedures),  	.procs		= nlm4_procedures, diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 8d4ea8351e3..ba1dc2eebd1 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -62,7 +62,8 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)  	host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,  				   nlm_init->protocol, nlm_version, -				   nlm_init->hostname, nlm_init->noresvport); +				   nlm_init->hostname, nlm_init->noresvport, +				   nlm_init->net);  	if (host == NULL) {  		lockd_down();  		return 
ERR_PTR(-ENOLCK); diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 180ac34feb9..3d35e3e80c1 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -596,19 +596,19 @@ static struct rpc_procinfo	nlm_procedures[] = {  	PROC(GRANTED_RES,	res,		norep),  }; -static struct rpc_version	nlm_version1 = { +static const struct rpc_version	nlm_version1 = {  		.number		= 1,  		.nrprocs	= ARRAY_SIZE(nlm_procedures),  		.procs		= nlm_procedures,  }; -static struct rpc_version	nlm_version3 = { +static const struct rpc_version	nlm_version3 = {  		.number		= 3,  		.nrprocs	= ARRAY_SIZE(nlm_procedures),  		.procs		= nlm_procedures,  }; -static struct rpc_version	*nlm_versions[] = { +static const struct rpc_version	*nlm_versions[] = {  	[1] = &nlm_version1,  	[3] = &nlm_version3,  #ifdef CONFIG_LOCKD_V4 @@ -618,7 +618,7 @@ static struct rpc_version	*nlm_versions[] = {  static struct rpc_stat		nlm_rpc_stats; -struct rpc_program		nlm_program = { +const struct rpc_program	nlm_program = {  		.name		= "lockd",  		.number		= NLM_PROGRAM,  		.nrvers		= ARRAY_SIZE(nlm_versions), diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 6f29836ec0c..eb75ca7c2d6 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -17,6 +17,8 @@  #include <linux/lockd/lockd.h>  #include <linux/mutex.h> +#include <linux/sunrpc/svc_xprt.h> +  #include <net/ipv6.h>  #define NLMDBG_FACILITY		NLMDBG_HOSTCACHE @@ -54,6 +56,7 @@ struct nlm_lookup_host_info {  	const char		*hostname;	/* remote's hostname */  	const size_t		hostname_len;	/* it's length */  	const int		noresvport;	/* use non-priv port */ +	struct net		*net;		/* network namespace to bind */  };  /* @@ -155,6 +158,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,  	INIT_LIST_HEAD(&host->h_reclaim);  	host->h_nsmhandle  = nsm;  	host->h_addrbuf    = nsm->sm_addrbuf; +	host->net	   = ni->net;  out:  	return host; @@ -206,7 +210,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,  				     const unsigned 
short protocol,  				     const u32 version,  				     const char *hostname, -				     int noresvport) +				     int noresvport, +				     struct net *net)  {  	struct nlm_lookup_host_info ni = {  		.server		= 0, @@ -217,6 +222,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,  		.hostname	= hostname,  		.hostname_len	= strlen(hostname),  		.noresvport	= noresvport, +		.net		= net,  	};  	struct hlist_head *chain;  	struct hlist_node *pos; @@ -231,6 +237,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,  	chain = &nlm_client_hosts[nlm_hash_address(sap)];  	hlist_for_each_entry(host, pos, chain, h_hash) { +		if (host->net != net) +			continue;  		if (!rpc_cmp_addr(nlm_addr(host), sap))  			continue; @@ -318,6 +326,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,  	struct nsm_handle *nsm = NULL;  	struct sockaddr *src_sap = svc_daddr(rqstp);  	size_t src_len = rqstp->rq_daddrlen; +	struct net *net = rqstp->rq_xprt->xpt_net;  	struct nlm_lookup_host_info ni = {  		.server		= 1,  		.sap		= svc_addr(rqstp), @@ -326,6 +335,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,  		.version	= rqstp->rq_vers,  		.hostname	= hostname,  		.hostname_len	= hostname_len, +		.net		= net,  	};  	dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, @@ -339,6 +349,8 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,  	chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];  	hlist_for_each_entry(host, pos, chain, h_hash) { +		if (host->net != net) +			continue;  		if (!rpc_cmp_addr(nlm_addr(host), ni.sap))  			continue; @@ -431,7 +443,7 @@ nlm_bind_host(struct nlm_host *host)  			.to_retries	= 5U,  		};  		struct rpc_create_args args = { -			.net		= &init_net, +			.net		= host->net,  			.protocol	= host->h_proto,  			.address	= nlm_addr(host),  			.addrsize	= host->h_addrlen, @@ -553,12 +565,8 @@ void nlm_host_rebooted(const struct nlm_reboot *info)  	nsm_release(nsm);  } -/* - * 
Shut down the hosts module. - * Note that this routine is called only at server shutdown time. - */  void -nlm_shutdown_hosts(void) +nlm_shutdown_hosts_net(struct net *net)  {  	struct hlist_head *chain;  	struct hlist_node *pos; @@ -570,6 +578,8 @@ nlm_shutdown_hosts(void)  	/* First, make all hosts eligible for gc */  	dprintk("lockd: nuking all hosts...\n");  	for_each_host(host, pos, chain, nlm_server_hosts) { +		if (net && host->net != net) +			continue;  		host->h_expires = jiffies - 1;  		if (host->h_rpcclnt) {  			rpc_shutdown_client(host->h_rpcclnt); @@ -580,15 +590,29 @@ nlm_shutdown_hosts(void)  	/* Then, perform a garbage collection pass */  	nlm_gc_hosts();  	mutex_unlock(&nlm_host_mutex); +} + +/* + * Shut down the hosts module. + * Note that this routine is called only at server shutdown time. + */ +void +nlm_shutdown_hosts(void) +{ +	struct hlist_head *chain; +	struct hlist_node *pos; +	struct nlm_host	*host; + +	nlm_shutdown_hosts_net(NULL);  	/* complain if any hosts are left */  	if (nrhosts != 0) {  		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");  		dprintk("lockd: %lu hosts left:\n", nrhosts);  		for_each_host(host, pos, chain, nlm_server_hosts) { -			dprintk("       %s (cnt %d use %d exp %ld)\n", +			dprintk("       %s (cnt %d use %d exp %ld net %p)\n",  				host->h_name, atomic_read(&host->h_count), -				host->h_inuse, host->h_expires); +				host->h_inuse, host->h_expires, host->net);  		}  	}  } diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 65ba36b80a9..7ef14b3c5be 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -47,7 +47,7 @@ struct nsm_res {  	u32			state;  }; -static struct rpc_program	nsm_program; +static const struct rpc_program	nsm_program;  static				LIST_HEAD(nsm_handles);  static				DEFINE_SPINLOCK(nsm_lock); @@ -62,14 +62,14 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)  	return (struct sockaddr *)&nsm->sm_addr;  } -static struct rpc_clnt *nsm_create(void) +static struct 
rpc_clnt *nsm_create(struct net *net)  {  	struct sockaddr_in sin = {  		.sin_family		= AF_INET,  		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),  	};  	struct rpc_create_args args = { -		.net			= &init_net, +		.net			= net,  		.protocol		= XPRT_TRANSPORT_UDP,  		.address		= (struct sockaddr *)&sin,  		.addrsize		= sizeof(sin), @@ -83,7 +83,8 @@ static struct rpc_clnt *nsm_create(void)  	return rpc_create(&args);  } -static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, +			 struct net *net)  {  	struct rpc_clnt	*clnt;  	int		status; @@ -99,7 +100,7 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)  		.rpc_resp	= res,  	}; -	clnt = nsm_create(); +	clnt = nsm_create(net);  	if (IS_ERR(clnt)) {  		status = PTR_ERR(clnt);  		dprintk("lockd: failed to create NSM upcall transport, " @@ -149,7 +150,7 @@ int nsm_monitor(const struct nlm_host *host)  	 */  	nsm->sm_mon_name = nsm_use_hostnames ? 
nsm->sm_name : nsm->sm_addrbuf; -	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); +	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host->net);  	if (unlikely(res.status != 0))  		status = -EIO;  	if (unlikely(status < 0)) { @@ -183,7 +184,7 @@ void nsm_unmonitor(const struct nlm_host *host)  	 && nsm->sm_monitored && !nsm->sm_sticky) {  		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); -		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); +		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host->net);  		if (res.status != 0)  			status = -EIO;  		if (status < 0) @@ -534,19 +535,19 @@ static struct rpc_procinfo	nsm_procedures[] = {  	},  }; -static struct rpc_version	nsm_version1 = { +static const struct rpc_version nsm_version1 = {  		.number		= 1,  		.nrprocs	= ARRAY_SIZE(nsm_procedures),  		.procs		= nsm_procedures  }; -static struct rpc_version *	nsm_version[] = { +static const struct rpc_version *nsm_version[] = {  	[1] = &nsm_version1,  };  static struct rpc_stat		nsm_stats; -static struct rpc_program	nsm_program = { +static const struct rpc_program nsm_program = {  		.name		= "statd",  		.number		= NSM_PROGRAM,  		.nrvers		= ARRAY_SIZE(nsm_version), diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h new file mode 100644 index 00000000000..ce227e0fbc5 --- /dev/null +++ b/fs/lockd/netns.h @@ -0,0 +1,12 @@ +#ifndef __LOCKD_NETNS_H__ +#define __LOCKD_NETNS_H__ + +#include <net/netns/generic.h> + +struct lockd_net { +	unsigned int nlmsvc_users; +}; + +extern int lockd_net_id; + +#endif diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index c061b9aa7dd..2774e1013b3 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -35,6 +35,8 @@  #include <linux/lockd/lockd.h>  #include <linux/nfs.h> +#include "netns.h" +  #define NLMDBG_FACILITY		NLMDBG_SVC  #define LOCKD_BUFSIZE		(1024 + NLMSVC_XDRSIZE)  #define ALLOWED_SIGS		(sigmask(SIGKILL)) @@ -50,6 +52,8 @@ static struct task_struct	*nlmsvc_task;  static struct svc_rqst		*nlmsvc_rqst;  unsigned long			nlmsvc_timeout; +int 
lockd_net_id; +  /*   * These can be set at insmod time (useful for NFS as root filesystem),   * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003 @@ -189,27 +193,29 @@ lockd(void *vrqstp)  }  static int create_lockd_listener(struct svc_serv *serv, const char *name, -				 const int family, const unsigned short port) +				 struct net *net, const int family, +				 const unsigned short port)  {  	struct svc_xprt *xprt; -	xprt = svc_find_xprt(serv, name, family, 0); +	xprt = svc_find_xprt(serv, name, net, family, 0);  	if (xprt == NULL) -		return svc_create_xprt(serv, name, &init_net, family, port, +		return svc_create_xprt(serv, name, net, family, port,  						SVC_SOCK_DEFAULTS);  	svc_xprt_put(xprt);  	return 0;  } -static int create_lockd_family(struct svc_serv *serv, const int family) +static int create_lockd_family(struct svc_serv *serv, struct net *net, +			       const int family)  {  	int err; -	err = create_lockd_listener(serv, "udp", family, nlm_udpport); +	err = create_lockd_listener(serv, "udp", net, family, nlm_udpport);  	if (err < 0)  		return err; -	return create_lockd_listener(serv, "tcp", family, nlm_tcpport); +	return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport);  }  /* @@ -222,16 +228,16 @@ static int create_lockd_family(struct svc_serv *serv, const int family)   * Returns zero if all listeners are available; otherwise a   * negative errno value is returned.   
*/ -static int make_socks(struct svc_serv *serv) +static int make_socks(struct svc_serv *serv, struct net *net)  {  	static int warned;  	int err; -	err = create_lockd_family(serv, PF_INET); +	err = create_lockd_family(serv, net, PF_INET);  	if (err < 0)  		goto out_err; -	err = create_lockd_family(serv, PF_INET6); +	err = create_lockd_family(serv, net, PF_INET6);  	if (err < 0 && err != -EAFNOSUPPORT)  		goto out_err; @@ -245,6 +251,47 @@ out_err:  	return err;  } +static int lockd_up_net(struct net *net) +{ +	struct lockd_net *ln = net_generic(net, lockd_net_id); +	struct svc_serv *serv = nlmsvc_rqst->rq_server; +	int error; + +	if (ln->nlmsvc_users) +		return 0; + +	error = svc_rpcb_setup(serv, net); +	if (error) +		goto err_rpcb; + +	error = make_socks(serv, net); +	if (error < 0) +		goto err_socks; +	return 0; + +err_socks: +	svc_rpcb_cleanup(serv, net); +err_rpcb: +	return error; +} + +static void lockd_down_net(struct net *net) +{ +	struct lockd_net *ln = net_generic(net, lockd_net_id); +	struct svc_serv *serv = nlmsvc_rqst->rq_server; + +	if (ln->nlmsvc_users) { +		if (--ln->nlmsvc_users == 0) { +			nlm_shutdown_hosts_net(net); +			svc_shutdown_net(serv, net); +		} +	} else { +		printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", +				nlmsvc_task, net); +		BUG(); +	} +} +  /*   * Bring up the lockd process if it's not already up.   */ @@ -252,13 +299,16 @@ int lockd_up(void)  {  	struct svc_serv *serv;  	int		error = 0; +	struct net *net = current->nsproxy->net_ns;  	mutex_lock(&nlmsvc_mutex);  	/*  	 * Check whether we're already up and running.  	 
*/ -	if (nlmsvc_rqst) +	if (nlmsvc_rqst) { +		error = lockd_up_net(net);  		goto out; +	}  	/*  	 * Sanity check: if there's no pid, @@ -275,7 +325,7 @@ int lockd_up(void)  		goto out;  	} -	error = make_socks(serv); +	error = make_socks(serv, net);  	if (error < 0)  		goto destroy_and_out; @@ -313,8 +363,12 @@ int lockd_up(void)  destroy_and_out:  	svc_destroy(serv);  out: -	if (!error) +	if (!error) { +		struct lockd_net *ln = net_generic(net, lockd_net_id); + +		ln->nlmsvc_users++;  		nlmsvc_users++; +	}  	mutex_unlock(&nlmsvc_mutex);  	return error;  } @@ -328,8 +382,10 @@ lockd_down(void)  {  	mutex_lock(&nlmsvc_mutex);  	if (nlmsvc_users) { -		if (--nlmsvc_users) +		if (--nlmsvc_users) { +			lockd_down_net(current->nsproxy->net_ns);  			goto out; +		}  	} else {  		printk(KERN_ERR "lockd_down: no users! task=%p\n",  			nlmsvc_task); @@ -497,24 +553,55 @@ module_param_call(nlm_tcpport, param_set_port, param_get_int,  module_param(nsm_use_hostnames, bool, 0644);  module_param(nlm_max_connections, uint, 0644); +static int lockd_init_net(struct net *net) +{ +	return 0; +} + +static void lockd_exit_net(struct net *net) +{ +} + +static struct pernet_operations lockd_net_ops = { +	.init = lockd_init_net, +	.exit = lockd_exit_net, +	.id = &lockd_net_id, +	.size = sizeof(struct lockd_net), +}; + +  /*   * Initialising and terminating the module.   */  static int __init init_nlm(void)  { +	int err; +  #ifdef CONFIG_SYSCTL +	err = -ENOMEM;  	nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); -	return nlm_sysctl_table ? 
0 : -ENOMEM; -#else +	if (nlm_sysctl_table == NULL) +		goto err_sysctl; +#endif +	err = register_pernet_subsys(&lockd_net_ops); +	if (err) +		goto err_pernet;  	return 0; + +err_pernet: +#ifdef CONFIG_SYSCTL +	unregister_sysctl_table(nlm_sysctl_table);  #endif +err_sysctl: +	return err;  }  static void __exit exit_nlm(void)  {  	/* FIXME: delete all NLM clients */  	nlm_shutdown_hosts(); +	unregister_pernet_subsys(&lockd_net_ops);  #ifdef CONFIG_SYSCTL  	unregister_sysctl_table(nlm_sysctl_table);  #endif diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index f0179c3745d..e46353f41a4 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -46,7 +46,6 @@ static void	nlmsvc_remove_block(struct nlm_block *block);  static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);  static void nlmsvc_freegrantargs(struct nlm_rqst *call);  static const struct rpc_call_ops nlmsvc_grant_ops; -static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);  /*   * The list of blocked locks to retry @@ -54,6 +53,35 @@ static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie);  static LIST_HEAD(nlm_blocked);  static DEFINE_SPINLOCK(nlm_blocked_lock); +#ifdef LOCKD_DEBUG +static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) +{ +	/* +	 * We can get away with a static buffer because we're only +	 * called with BKL held. 
+	 */ +	static char buf[2*NLM_MAXCOOKIELEN+1]; +	unsigned int i, len = sizeof(buf); +	char *p = buf; + +	len--;	/* allow for trailing \0 */ +	if (len < 3) +		return "???"; +	for (i = 0 ; i < cookie->len ; i++) { +		if (len < 2) { +			strcpy(p-3, "..."); +			break; +		} +		sprintf(p, "%02x", cookie->data[i]); +		p += 2; +		len -= 2; +	} +	*p = '\0'; + +	return buf; +} +#endif +  /*   * Insert a blocked lock into the global list   */ @@ -935,32 +963,3 @@ nlmsvc_retry_blocked(void)  	return timeout;  } - -#ifdef RPC_DEBUG -static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) -{ -	/* -	 * We can get away with a static buffer because we're only -	 * called with BKL held. -	 */ -	static char buf[2*NLM_MAXCOOKIELEN+1]; -	unsigned int i, len = sizeof(buf); -	char *p = buf; - -	len--;	/* allow for trailing \0 */ -	if (len < 3) -		return "???"; -	for (i = 0 ; i < cookie->len ; i++) { -		if (len < 2) { -			strcpy(p-3, "..."); -			break; -		} -		sprintf(p, "%02x", cookie->data[i]); -		p += 2; -		len -= 2; -	} -	*p = '\0'; - -	return buf; -} -#endif diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index e97404d611e..9c501449450 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -152,9 +152,6 @@ static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)  	filler_t *filler = logfs_mtd_readpage;  	struct mtd_info *mtd = super->s_mtd; -	if (!mtd_can_have_bb(mtd)) -		return NULL; -  	*ofs = 0;  	while (mtd_block_isbad(mtd, *ofs)) {  		*ofs += mtd->erasesize; @@ -172,9 +169,6 @@ static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)  	filler_t *filler = logfs_mtd_readpage;  	struct mtd_info *mtd = super->s_mtd; -	if (!mtd_can_have_bb(mtd)) -		return NULL; -  	*ofs = mtd->size - mtd->erasesize;  	while (mtd_block_isbad(mtd, *ofs)) {  		*ofs -= mtd->erasesize; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 501043e8966..bea5d1b9954 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -71,7 +71,7 @@ static int 
write_dir(struct inode *dir, struct logfs_disk_dentry *dd,  static int write_inode(struct inode *inode)  { -	return __logfs_write_inode(inode, WF_LOCK); +	return __logfs_write_inode(inode, NULL, WF_LOCK);  }  static s64 dir_seek_data(struct inode *inode, s64 pos) @@ -177,17 +177,17 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)  				(filler_t *)logfs_readpage, NULL);  		if (IS_ERR(page))  			return page; -		dd = kmap_atomic(page, KM_USER0); +		dd = kmap_atomic(page);  		BUG_ON(dd->namelen == 0);  		if (name->len != be16_to_cpu(dd->namelen) ||  				memcmp(name->name, dd->name, name->len)) { -			kunmap_atomic(dd, KM_USER0); +			kunmap_atomic(dd);  			page_cache_release(page);  			continue;  		} -		kunmap_atomic(dd, KM_USER0); +		kunmap_atomic(dd);  		return page;  	}  	return NULL; @@ -365,9 +365,9 @@ static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,  		return NULL;  	}  	index = page->index; -	dd = kmap_atomic(page, KM_USER0); +	dd = kmap_atomic(page);  	ino = be64_to_cpu(dd->ino); -	kunmap_atomic(dd, KM_USER0); +	kunmap_atomic(dd);  	page_cache_release(page);  	inode = logfs_iget(dir->i_sb, ino); @@ -402,12 +402,12 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry,  		if (!page)  			return -ENOMEM; -		dd = kmap_atomic(page, KM_USER0); +		dd = kmap_atomic(page);  		memset(dd, 0, sizeof(*dd));  		dd->ino = cpu_to_be64(inode->i_ino);  		dd->type = logfs_type(inode);  		logfs_set_name(dd, &dentry->d_name); -		kunmap_atomic(dd, KM_USER0); +		kunmap_atomic(dd);  		err = logfs_write_buf(dir, page, WF_LOCK);  		unlock_page(page); @@ -558,9 +558,6 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir,  {  	struct inode *inode = old_dentry->d_inode; -	if (inode->i_nlink >= LOGFS_LINK_MAX) -		return -EMLINK; -  	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;  	ihold(inode);  	inc_nlink(inode); @@ -579,9 +576,9 @@ static int logfs_get_dd(struct inode *dir, struct 
dentry *dentry,  	if (IS_ERR(page))  		return PTR_ERR(page);  	*pos = page->index; -	map = kmap_atomic(page, KM_USER0); +	map = kmap_atomic(page);  	memcpy(dd, map, sizeof(*dd)); -	kunmap_atomic(map, KM_USER0); +	kunmap_atomic(map);  	page_cache_release(page);  	return 0;  } diff --git a/fs/logfs/file.c b/fs/logfs/file.c index b548c87a86f..3886cded283 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)  		return ret;  	mutex_lock(&inode->i_mutex); +	logfs_get_wblocks(sb, NULL, WF_LOCK);  	logfs_write_anchor(sb); +	logfs_put_wblocks(sb, NULL, WF_LOCK);  	mutex_unlock(&inode->i_mutex);  	return 0; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c index caa4419285d..d4efb061bdc 100644 --- a/fs/logfs/gc.c +++ b/fs/logfs/gc.c @@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb)  	int i, max_dist;  	struct gc_candidate *cand = NULL, *this; -	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); +	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);  	for (i = max_dist; i >= 0; i--) {  		this = first_in_list(&super->s_low_list[i]); diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 388df1aa35e..a422f42238b 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -286,7 +286,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)  	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)  		return 0; -	ret = __logfs_write_inode(inode, flags); +	ret = __logfs_write_inode(inode, NULL, flags);  	LOGFS_BUG_ON(ret, inode->i_sb);  	return ret;  } @@ -363,7 +363,9 @@ static void logfs_init_once(void *_li)  static int logfs_sync_fs(struct super_block *sb, int wait)  { +	logfs_get_wblocks(sb, NULL, WF_LOCK);  	logfs_write_anchor(sb); +	logfs_put_wblocks(sb, NULL, WF_LOCK);  	return 0;  } diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 9da29706f91..1e1c369df22 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ 
-612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,  	if (len == 0)  		return logfs_write_header(super, header, 0, type); -	BUG_ON(len > sb->s_blocksize);  	compr_len = logfs_compress(buf, data, len, sb->s_blocksize);  	if (compr_len < 0 || type == JE_ANCHOR) {  		memcpy(data, buf, len); diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 926373866a5..5f093760946 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void);  void logfs_set_blocks(struct inode *inode, u64 no);  /* these logically belong into inode.c but actually reside in readwrite.c */  int logfs_read_inode(struct inode *inode); -int __logfs_write_inode(struct inode *inode, long flags); +int __logfs_write_inode(struct inode *inode, struct page *, long flags);  void logfs_evict_inode(struct inode *inode);  /* journal.c */ @@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block,  		__be64 *array, int page_is_empty);  int logfs_exist_block(struct inode *inode, u64 bix);  int get_page_reserve(struct inode *inode, struct page *page); +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);  extern struct logfs_block_ops indirect_block_ops;  /* segment.c */ @@ -594,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb);  void logfs_sync_area(struct logfs_area *area);  void logfs_sync_segments(struct super_block *sb);  void freeseg(struct super_block *sb, u32 segno); +void free_areas(struct super_block *sb);  /* area handling */  int logfs_init_areas(struct super_block *sb); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 2ac4217b790..e3ab5e5a904 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock)   * is waiting for s_write_mutex.  
We annotate this fact by setting PG_pre_locked   * in addition to PG_locked.   */ -static void logfs_get_wblocks(struct super_block *sb, struct page *page, -		int lock) +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)  {  	struct logfs_super *super = logfs_super(sb); @@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page,  	}  } -static void logfs_put_wblocks(struct super_block *sb, struct page *page, -		int lock) +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)  {  	struct logfs_super *super = logfs_super(sb); @@ -424,7 +422,7 @@ static void inode_write_block(struct logfs_block *block)  	if (inode->i_ino == LOGFS_INO_MASTER)  		logfs_write_anchor(inode->i_sb);  	else { -		ret = __logfs_write_inode(inode, 0); +		ret = __logfs_write_inode(inode, NULL, 0);  		/* see indirect_write_block comment */  		BUG_ON(ret);  	} @@ -519,9 +517,9 @@ static int indirect_write_alias(struct super_block *sb,  		ino = page->mapping->host->i_ino;  		logfs_unpack_index(page->index, &bix, &level); -		child = kmap_atomic(page, KM_USER0); +		child = kmap_atomic(page);  		val = child[pos]; -		kunmap_atomic(child, KM_USER0); +		kunmap_atomic(child);  		err = write_one_alias(sb, ino, bix, level, pos, val);  		if (err)  			return err; @@ -560,8 +558,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block)  static void indirect_free_block(struct super_block *sb,  		struct logfs_block *block)  { -	ClearPagePrivate(block->page); -	block->page->private = 0; +	struct page *page = block->page; + +	if (PagePrivate(page)) { +		ClearPagePrivate(page); +		page_cache_release(page); +		set_page_private(page, 0); +	}  	__free_block(sb, block);  } @@ -650,8 +653,11 @@ static void alloc_data_block(struct inode *inode, struct page *page)  	logfs_unpack_index(page->index, &bix, &level);  	block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);  	block->page = page; +  	
SetPagePrivate(page); -	page->private = (unsigned long)block; +	page_cache_get(page); +	set_page_private(page, (unsigned long) block); +  	block->ops = &indirect_block_ops;  } @@ -667,9 +673,9 @@ static void alloc_indirect_block(struct inode *inode, struct page *page,  	alloc_data_block(inode, page);  	block = logfs_block(page); -	array = kmap_atomic(page, KM_USER0); +	array = kmap_atomic(page);  	initialize_block_counters(page, block, array, page_is_empty); -	kunmap_atomic(array, KM_USER0); +	kunmap_atomic(array);  }  static void block_set_pointer(struct page *page, int index, u64 ptr) @@ -679,10 +685,10 @@ static void block_set_pointer(struct page *page, int index, u64 ptr)  	u64 oldptr;  	BUG_ON(!block); -	array = kmap_atomic(page, KM_USER0); +	array = kmap_atomic(page);  	oldptr = be64_to_cpu(array[index]);  	array[index] = cpu_to_be64(ptr); -	kunmap_atomic(array, KM_USER0); +	kunmap_atomic(array);  	SetPageUptodate(page);  	block->full += !!(ptr & LOGFS_FULLY_POPULATED) @@ -695,9 +701,9 @@ static u64 block_get_pointer(struct page *page, int index)  	__be64 *block;  	u64 ptr; -	block = kmap_atomic(page, KM_USER0); +	block = kmap_atomic(page);  	ptr = be64_to_cpu(block[index]); -	kunmap_atomic(block, KM_USER0); +	kunmap_atomic(block);  	return ptr;  } @@ -844,7 +850,7 @@ static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)  		}  		slot = get_bits(bix, SUBLEVEL(level)); -		rblock = kmap_atomic(page, KM_USER0); +		rblock = kmap_atomic(page);  		while (slot < LOGFS_BLOCK_FACTOR) {  			if (data && (rblock[slot] != 0))  				break; @@ -855,12 +861,12 @@ static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)  			bix &= ~(increment - 1);  		}  		if (slot >= LOGFS_BLOCK_FACTOR) { -			kunmap_atomic(rblock, KM_USER0); +			kunmap_atomic(rblock);  			logfs_put_read_page(page);  			return bix;  		}  		bofs = be64_to_cpu(rblock[slot]); -		kunmap_atomic(rblock, KM_USER0); +		kunmap_atomic(rblock);  		logfs_put_read_page(page);  		if (!bofs) {  			
BUG_ON(data); @@ -1570,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags)  static int __logfs_delete(struct inode *inode, struct page *page)  {  	long flags = WF_DELETE; +	int err;  	inode->i_ctime = inode->i_mtime = CURRENT_TIME;  	if (page->index < I0_BLOCKS)  		return logfs_write_direct(inode, page, flags); +	err = grow_inode(inode, page->index, 0); +	if (err) +		return err;  	return logfs_write_rec(inode, page, page->index, 0, flags);  } @@ -1623,7 +1633,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,  			if (inode->i_ino == LOGFS_INO_MASTER)  				logfs_write_anchor(inode->i_sb);  			else { -				err = __logfs_write_inode(inode, flags); +				err = __logfs_write_inode(inode, page, flags);  			}  		}  	} @@ -1873,7 +1883,7 @@ int logfs_truncate(struct inode *inode, u64 target)  		logfs_get_wblocks(sb, NULL, 1);  		err = __logfs_truncate(inode, size);  		if (!err) -			err = __logfs_write_inode(inode, 0); +			err = __logfs_write_inode(inode, NULL, 0);  		logfs_put_wblocks(sb, NULL, 1);  	} @@ -1901,8 +1911,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page)  	li->li_block = block;  	block->page = NULL; -	page->private = 0; -	ClearPagePrivate(page); +	if (PagePrivate(page)) { +		ClearPagePrivate(page); +		page_cache_release(page); +		set_page_private(page, 0); +	}  }  static void move_inode_to_page(struct page *page, struct inode *inode) @@ -1918,8 +1931,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode)  	BUG_ON(PagePrivate(page));  	block->ops = &indirect_block_ops;  	block->page = page; -	page->private = (unsigned long)block; -	SetPagePrivate(page); + +	if (!PagePrivate(page)) { +		SetPagePrivate(page); +		page_cache_get(page); +		set_page_private(page, (unsigned long) block); +	}  	block->inode = NULL;  	li->li_block = NULL; @@ -1944,9 +1961,9 @@ int logfs_read_inode(struct inode *inode)  	if (IS_ERR(page))  		return PTR_ERR(page); -	di = kmap_atomic(page, 
KM_USER0); +	di = kmap_atomic(page);  	logfs_disk_to_inode(di, inode); -	kunmap_atomic(di, KM_USER0); +	kunmap_atomic(di);  	move_page_to_inode(inode, page);  	page_cache_release(page);  	return 0; @@ -1965,9 +1982,9 @@ static struct page *inode_to_page(struct inode *inode)  	if (!page)  		return NULL; -	di = kmap_atomic(page, KM_USER0); +	di = kmap_atomic(page);  	logfs_inode_to_disk(inode, di); -	kunmap_atomic(di, KM_USER0); +	kunmap_atomic(di);  	move_inode_to_page(page, inode);  	return page;  } @@ -2024,13 +2041,13 @@ static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,  	if (write)  		alloc_indirect_block(inode, page, 0); -	se = kmap_atomic(page, KM_USER0); +	se = kmap_atomic(page);  	change_se(se + child_no, arg);  	if (write) {  		logfs_set_alias(sb, logfs_block(page), child_no);  		BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);  	} -	kunmap_atomic(se, KM_USER0); +	kunmap_atomic(se);  	logfs_put_write_page(page);  } @@ -2106,14 +2123,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)  			ec_level);  } -int __logfs_write_inode(struct inode *inode, long flags) +int __logfs_write_inode(struct inode *inode, struct page *page, long flags)  {  	struct super_block *sb = inode->i_sb;  	int ret; -	logfs_get_wblocks(sb, NULL, flags & WF_LOCK); +	logfs_get_wblocks(sb, page, flags & WF_LOCK);  	ret = do_write_inode(inode); -	logfs_put_wblocks(sb, NULL, flags & WF_LOCK); +	logfs_put_wblocks(sb, page, flags & WF_LOCK);  	return ret;  } @@ -2228,10 +2245,10 @@ int logfs_inode_write(struct inode *inode, const void *buf, size_t count,  	if (!page)  		return -ENOMEM; -	pagebuf = kmap_atomic(page, KM_USER0); +	pagebuf = kmap_atomic(page);  	memcpy(pagebuf, buf, count);  	flush_dcache_page(page); -	kunmap_atomic(pagebuf, KM_USER0); +	kunmap_atomic(pagebuf);  	if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)  		i_size_write(inode, pos + LOGFS_BLOCKSIZE); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 
9d518735325..e28d090c98d 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,  		BUG_ON(!page); /* FIXME: reserve a pool */  		SetPageUptodate(page);  		memcpy(page_address(page) + offset, buf, copylen); -		SetPagePrivate(page); + +		if (!PagePrivate(page)) { +			SetPagePrivate(page); +			page_cache_get(page); +		}  		page_cache_release(page);  		buf += copylen; @@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area)  		page = get_mapping_page(sb, index, 0);  		BUG_ON(!page); /* FIXME: reserve a pool */  		memset(page_address(page) + offset, 0xff, len); -		SetPagePrivate(page); +		if (!PagePrivate(page)) { +			SetPagePrivate(page); +			page_cache_get(page); +		}  		page_cache_release(page);  	}  } @@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area)  		BUG_ON(!page); /* FIXME: reserve a pool */  		SetPageUptodate(page);  		memset(page_address(page), 0xff, PAGE_CACHE_SIZE); -		SetPagePrivate(page); +		if (!PagePrivate(page)) { +			SetPagePrivate(page); +			page_cache_get(page); +		}  		page_cache_release(page);  		index++;  		no_indizes--; @@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page,  		mempool_free(item, super->s_alias_pool);  	}  	block->page = page; -	SetPagePrivate(page); -	page->private = (unsigned long)block; + +	if (!PagePrivate(page)) { +		SetPagePrivate(page); +		page_cache_get(page); +		set_page_private(page, (unsigned long) block); +	}  	block->ops = &indirect_block_ops;  	initialize_block_counters(page, block, data, 0);  } @@ -529,15 +543,19 @@ void move_page_to_btree(struct page *page)  		BUG_ON(!item); /* mempool empty */  		memset(item, 0, sizeof(*item)); -		child = kmap_atomic(page, KM_USER0); +		child = kmap_atomic(page);  		item->val = child[pos]; -		kunmap_atomic(child, KM_USER0); +		kunmap_atomic(child);  		item->child_no = pos;  		list_add(&item->list, 
&block->item_list);  	}  	block->page = NULL; -	ClearPagePrivate(page); -	page->private = 0; + +	if (PagePrivate(page)) { +		ClearPagePrivate(page); +		page_cache_release(page); +		set_page_private(page, 0); +	}  	block->ops = &btree_block_ops;  	err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,  			block); @@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno)  		page = find_get_page(mapping, ofs >> PAGE_SHIFT);  		if (!page)  			continue; -		ClearPagePrivate(page); +		if (PagePrivate(page)) { +			ClearPagePrivate(page); +			page_cache_release(page); +		}  		page_cache_release(page);  	}  } @@ -841,6 +862,16 @@ static void free_area(struct logfs_area *area)  	kfree(area);  } +void free_areas(struct super_block *sb) +{ +	struct logfs_super *super = logfs_super(sb); +	int i; + +	for_each_area(i) +		free_area(super->s_area[i]); +	free_area(super->s_journal_area); +} +  static struct logfs_area *alloc_area(struct super_block *sb)  {  	struct logfs_area *area; @@ -923,10 +954,6 @@ err:  void logfs_cleanup_areas(struct super_block *sb)  {  	struct logfs_super *super = logfs_super(sb); -	int i;  	btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); -	for_each_area(i) -		free_area(super->s_area[i]); -	free_area(super->s_journal_area);  } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index e795c234ea3..97bca623d89 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -315,11 +315,9 @@ static int logfs_get_sb_final(struct super_block *sb)  	if (IS_ERR(rootdir))  		goto fail; -	sb->s_root = d_alloc_root(rootdir); -	if (!sb->s_root) { -		iput(rootdir); +	sb->s_root = d_make_root(rootdir); +	if (!sb->s_root)  		goto fail; -	}  	/* at that point we know that ->put_super() will be called */  	super->s_erase_page = alloc_pages(GFP_KERNEL, 0); @@ -486,14 +484,15 @@ static void logfs_kill_sb(struct super_block *sb)  	/* Alias entries slow down mount, so evict as many as possible */  	sync_filesystem(sb);  	
logfs_write_anchor(sb); +	free_areas(sb);  	/*  	 * From this point on alias entries are simply dropped - and any  	 * writes to the object store are considered bugs.  	 */ -	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;  	log_super("LogFS: Now in shutdown\n");  	generic_shutdown_super(sb); +	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;  	BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); @@ -541,6 +540,7 @@ static struct dentry *logfs_get_sb_device(struct logfs_super *super,  	 * the filesystem incompatible with 32bit systems.  	 */  	sb->s_maxbytes	= (1ull << 43) - 1; +	sb->s_max_links = LOGFS_LINK_MAX;  	sb->s_op	= &logfs_super_operations;  	sb->s_flags	= flags | MS_NOATIME; @@ -626,7 +626,10 @@ static int __init logfs_init(void)  	if (ret)  		goto out2; -	return register_filesystem(&logfs_fs_type); +	ret = register_filesystem(&logfs_fs_type); +	if (!ret) +		return 0; +	logfs_destroy_inode_cache();  out2:  	logfs_compr_exit();  out1: diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 085a9262c69..685b2d981b8 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -335,7 +335,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)  		goto fail;  	} -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memset(kaddr, 0, PAGE_CACHE_SIZE);  	if (sbi->s_version == MINIX_V3) { @@ -355,7 +355,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)  		de->inode = dir->i_ino;  		strcpy(de->name, "..");  	} -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);  fail: diff --git a/fs/minix/inode.c b/fs/minix/inode.c index fa8b612b8ce..fcb05d2c6b5 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -190,24 +190,24 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)  		sbi->s_version = MINIX_V1;  		sbi->s_dirsize = 16;  		sbi->s_namelen = 14; -		sbi->s_link_max = MINIX_LINK_MAX; +		s->s_max_links = MINIX_LINK_MAX;  	} else if (s->s_magic == 
MINIX_SUPER_MAGIC2) {  		sbi->s_version = MINIX_V1;  		sbi->s_dirsize = 32;  		sbi->s_namelen = 30; -		sbi->s_link_max = MINIX_LINK_MAX; +		s->s_max_links = MINIX_LINK_MAX;  	} else if (s->s_magic == MINIX2_SUPER_MAGIC) {  		sbi->s_version = MINIX_V2;  		sbi->s_nzones = ms->s_zones;  		sbi->s_dirsize = 16;  		sbi->s_namelen = 14; -		sbi->s_link_max = MINIX2_LINK_MAX; +		s->s_max_links = MINIX2_LINK_MAX;  	} else if (s->s_magic == MINIX2_SUPER_MAGIC2) {  		sbi->s_version = MINIX_V2;  		sbi->s_nzones = ms->s_zones;  		sbi->s_dirsize = 32;  		sbi->s_namelen = 30; -		sbi->s_link_max = MINIX2_LINK_MAX; +		s->s_max_links = MINIX2_LINK_MAX;  	} else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) {  		m3s = (struct minix3_super_block *) bh->b_data;  		s->s_magic = m3s->s_magic; @@ -221,9 +221,9 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)  		sbi->s_dirsize = 64;  		sbi->s_namelen = 60;  		sbi->s_version = MINIX_V3; -		sbi->s_link_max = MINIX2_LINK_MAX;  		sbi->s_mount_state = MINIX_VALID_FS;  		sb_set_blocksize(s, m3s->s_blocksize); +		s->s_max_links = MINIX2_LINK_MAX;  	} else  		goto out_no_fs; @@ -254,14 +254,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)  	minix_set_bit(0,sbi->s_imap[0]->b_data);  	minix_set_bit(0,sbi->s_zmap[0]->b_data); -	/* set up enough so that it can read an inode */ -	s->s_op = &minix_sops; -	root_inode = minix_iget(s, MINIX_ROOT_INO); -	if (IS_ERR(root_inode)) { -		ret = PTR_ERR(root_inode); -		goto out_no_root; -	} -  	/* Apparently minix can create filesystems that allocate more blocks for  	 * the bitmaps than needed.  We simply ignore that, but verify it didn't  	 * create one with not enough blocks and bail out if so. @@ -270,7 +262,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)  	if (sbi->s_imap_blocks < block) {  		printk("MINIX-fs: file system does not have enough "  				"imap blocks allocated.  
Refusing to mount\n"); -		goto out_iput; +		goto out_no_bitmap;  	}  	block = minix_blocks_needed( @@ -279,13 +271,21 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)  	if (sbi->s_zmap_blocks < block) {  		printk("MINIX-fs: file system does not have enough "  				"zmap blocks allocated.  Refusing to mount.\n"); -		goto out_iput; +		goto out_no_bitmap; +	} + +	/* set up enough so that it can read an inode */ +	s->s_op = &minix_sops; +	root_inode = minix_iget(s, MINIX_ROOT_INO); +	if (IS_ERR(root_inode)) { +		ret = PTR_ERR(root_inode); +		goto out_no_root;  	}  	ret = -ENOMEM; -	s->s_root = d_alloc_root(root_inode); +	s->s_root = d_make_root(root_inode);  	if (!s->s_root) -		goto out_iput; +		goto out_no_root;  	if (!(s->s_flags & MS_RDONLY)) {  		if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ @@ -301,10 +301,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)  	return 0; -out_iput: -	iput(root_inode); -	goto out_freemap; -  out_no_root:  	if (!silent)  		printk("MINIX-fs: get root inode failed\n"); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index c889ef0aa57..1ebd1185462 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -34,7 +34,6 @@ struct minix_sb_info {  	unsigned long s_max_size;  	int s_dirsize;  	int s_namelen; -	int s_link_max;  	struct buffer_head ** s_imap;  	struct buffer_head ** s_zmap;  	struct buffer_head * s_sbh; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 2f76e38c206..2d0ee178630 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -94,9 +94,6 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,  {  	struct inode *inode = old_dentry->d_inode; -	if (inode->i_nlink >= minix_sb(inode->i_sb)->s_link_max) -		return -EMLINK; -  	inode->i_ctime = CURRENT_TIME_SEC;  	inode_inc_link_count(inode);  	ihold(inode); @@ -106,10 +103,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,  static int minix_mkdir(struct inode 
* dir, struct dentry *dentry, umode_t mode)  {  	struct inode * inode; -	int err = -EMLINK; - -	if (dir->i_nlink >= minix_sb(dir->i_sb)->s_link_max) -		goto out; +	int err;  	inode_inc_link_count(dir); @@ -181,7 +175,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)  static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,  			   struct inode * new_dir, struct dentry *new_dentry)  { -	struct minix_sb_info * info = minix_sb(old_dir->i_sb);  	struct inode * old_inode = old_dentry->d_inode;  	struct inode * new_inode = new_dentry->d_inode;  	struct page * dir_page = NULL; @@ -219,11 +212,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,  			drop_nlink(new_inode);  		inode_dec_link_count(new_inode);  	} else { -		if (dir_de) { -			err = -EMLINK; -			if (new_dir->i_nlink >= info->s_link_max) -				goto out_dir; -		}  		err = minix_add_link(new_dentry, old_inode);  		if (err)  			goto out_dir; diff --git a/fs/mpage.c b/fs/mpage.c index 643e9f55ef2..0face1c4d4c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -13,7 +13,7 @@   */  #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/mm.h>  #include <linux/kdev_t.h>  #include <linux/gfp.h> diff --git a/fs/namei.c b/fs/namei.c index 208c6aa4a98..e615ff37e27 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -15,7 +15,7 @@   */  #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/slab.h>  #include <linux/fs.h>  #include <linux/namei.h> @@ -161,7 +161,7 @@ static char *getname_flags(const char __user *filename, int flags, int *empty)  char *getname(const char __user * filename)  { -	return getname_flags(filename, 0, 0); +	return getname_flags(filename, 0, NULL);  }  #ifdef CONFIG_AUDITSYSCALL @@ -642,7 +642,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)  	cond_resched();  	current->total_link_count++; -	touch_atime(link->mnt, dentry); +	touch_atime(link);  	
nd_set_link(nd, NULL);  	error = security_inode_follow_link(link->dentry, nd); @@ -1095,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr  	struct dentry *old;  	/* Don't create child dentry for a dead directory. */ -	if (unlikely(IS_DEADDIR(inode))) +	if (unlikely(IS_DEADDIR(inode))) { +		dput(dentry);  		return ERR_PTR(-ENOENT); +	}  	old = inode->i_op->lookup(inode, dentry, nd);  	if (unlikely(old)) { @@ -1373,6 +1375,157 @@ static inline int can_lookup(struct inode *inode)  }  /* + * We can do the critical dentry name comparison and hashing + * operations one word at a time, but we are limited to: + * + * - Architectures with fast unaligned word accesses. We could + *   do a "get_unaligned()" if this helps and is sufficiently + *   fast. + * + * - Little-endian machines (so that we can generate the mask + *   of low bytes efficiently). Again, we *could* do a byte + *   swapping load on big-endian architectures if that is not + *   expensive enough to make the optimization worthless. + * + * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we + *   do not trap on the (extremely unlikely) case of a page + *   crossing operation. + * + * - Furthermore, we need an efficient 64-bit compile for the + *   64-bit case in order to generate the "number of bytes in + *   the final mask". Again, that could be replaced with a + *   efficient population count instruction or similar. + */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. 
+ */ +static inline long count_masked_bytes(unsigned long mask) +{ +	return mask*0x0001020304050608ul >> 56; +} + +static inline unsigned int fold_hash(unsigned long hash) +{ +	hash += hash >> (8*sizeof(int)); +	return hash; +} + +#else	/* 32-bit case */ + +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ +	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ +	long a = (0x0ff0001+mask) >> 23; +	/* Fix the 1 for 00 case */ +	return a & mask; +} + +#define fold_hash(x) (x) + +#endif + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ +	unsigned long a, mask; +	unsigned long hash = 0; + +	for (;;) { +		a = *(unsigned long *)name; +		if (len < sizeof(unsigned long)) +			break; +		hash += a; +		hash *= 9; +		name += sizeof(unsigned long); +		len -= sizeof(unsigned long); +		if (!len) +			goto done; +	} +	mask = ~(~0ul << len*8); +	hash += mask & a; +done: +	return fold_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x)) +#define ONEBYTES	REPEAT_BYTE(0x01) +#define SLASHBYTES	REPEAT_BYTE('/') +#define HIGHBITS	REPEAT_BYTE(0x80) + +/* Return the high bit set in the first byte that is a zero */ +static inline unsigned long has_zero(unsigned long a) +{ +	return ((a - ONEBYTES) & ~a) & HIGHBITS; +} + +/* + * Calculate the length and hash of the path component, and + * return the length of the component; + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ +	unsigned long a, mask, hash, len; + +	hash = a = 0; +	len = -sizeof(unsigned long); +	do { +		hash = (hash + a) * 9; +		len += sizeof(unsigned long); +		a = *(unsigned long *)(name+len); +		/* Do we have any NUL or '/' bytes in this word? 
*/ +		mask = has_zero(a) | has_zero(a ^ SLASHBYTES); +	} while (!mask); + +	/* The mask *below* the first high bit set */ +	mask = (mask - 1) & ~mask; +	mask >>= 7; +	hash += a & mask; +	*hashp = fold_hash(hash); + +	return len + count_masked_bytes(mask); +} + +#else + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ +	unsigned long hash = init_name_hash(); +	while (len--) +		hash = partial_name_hash(*name++, hash); +	return end_name_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +/* + * We know there's a real path component here of at least + * one character. + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ +	unsigned long hash = init_name_hash(); +	unsigned long len = 0, c; + +	c = (unsigned char)*name; +	do { +		len++; +		hash = partial_name_hash(c, hash); +		c = (unsigned char)name[len]; +	} while (c && c != '/'); +	*hashp = end_name_hash(hash); +	return len; +} + +#endif + +/*   * Name resolution.   * This is the basic name resolution function, turning a pathname into   * the final dentry. We expect 'base' to be positive and a directory. @@ -1392,31 +1545,22 @@ static int link_path_walk(const char *name, struct nameidata *nd)  	/* At this point we know we have a real path component. 
*/  	for(;;) { -		unsigned long hash;  		struct qstr this; -		unsigned int c; +		long len;  		int type;  		err = may_lookup(nd);   		if (err)  			break; +		len = hash_name(name, &this.hash);  		this.name = name; -		c = *(const unsigned char *)name; - -		hash = init_name_hash(); -		do { -			name++; -			hash = partial_name_hash(c, hash); -			c = *(const unsigned char *)name; -		} while (c && (c != '/')); -		this.len = name - (const char *) this.name; -		this.hash = end_name_hash(hash); +		this.len = len;  		type = LAST_NORM; -		if (this.name[0] == '.') switch (this.len) { +		if (name[0] == '.') switch (len) {  			case 2: -				if (this.name[1] == '.') { +				if (name[1] == '.') {  					type = LAST_DOTDOT;  					nd->flags |= LOOKUP_JUMPED;  				} @@ -1435,12 +1579,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)  			}  		} -		/* remove trailing slashes? */ -		if (!c) +		if (!name[len])  			goto last_component; -		while (*++name == '/'); -		if (!*name) +		/* +		 * If it wasn't NUL, we know it was '/'. Skip that +		 * slash, and continue until no more slashes. 
+		 */ +		do { +			len++; +		} while (unlikely(name[len] == '/')); +		if (!name[len])  			goto last_component; +		name += len;  		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);  		if (err < 0) @@ -1773,24 +1923,21 @@ static struct dentry *lookup_hash(struct nameidata *nd)  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)  {  	struct qstr this; -	unsigned long hash;  	unsigned int c;  	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));  	this.name = name;  	this.len = len; +	this.hash = full_name_hash(name, len);  	if (!len)  		return ERR_PTR(-EACCES); -	hash = init_name_hash();  	while (len--) {  		c = *(const unsigned char *)name++;  		if (c == '/' || c == '\0')  			return ERR_PTR(-EACCES); -		hash = partial_name_hash(c, hash);  	} -	this.hash = end_name_hash(hash);  	/*  	 * See if the low-level filesystem might want  	 * to use its own hash.. @@ -1825,7 +1972,7 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,  int user_path_at(int dfd, const char __user *name, unsigned flags,  		 struct path *path)  { -	return user_path_at_empty(dfd, name, flags, path, 0); +	return user_path_at_empty(dfd, name, flags, path, NULL);  }  static int user_path_parent(int dfd, const char __user *path, @@ -2138,7 +2285,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,  		/* sayonara */  		error = complete_walk(nd);  		if (error) -			return ERR_PTR(-ECHILD); +			return ERR_PTR(error);  		error = -ENOTDIR;  		if (nd->flags & LOOKUP_DIRECTORY) { @@ -2237,7 +2384,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,  	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... 
*/  	error = complete_walk(nd);  	if (error) -		goto exit; +		return ERR_PTR(error);  	error = -EISDIR;  	if (S_ISDIR(nd->inode->i_mode))  		goto exit; @@ -2545,6 +2692,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d  int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  {  	int error = may_create(dir, dentry); +	unsigned max_links = dir->i_sb->s_max_links;  	if (error)  		return error; @@ -2557,6 +2705,9 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	if (error)  		return error; +	if (max_links && dir->i_nlink >= max_links) +		return -EMLINK; +  	error = dir->i_op->mkdir(dir, dentry, mode);  	if (!error)  		fsnotify_mkdir(dir, dentry); @@ -2887,6 +3038,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn  int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)  {  	struct inode *inode = old_dentry->d_inode; +	unsigned max_links = dir->i_sb->s_max_links;  	int error;  	if (!inode) @@ -2917,6 +3069,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de  	/* Make sure we don't allow creating hardlink to an unlinked file */  	if (inode->i_nlink == 0)  		error =  -ENOENT; +	else if (max_links && inode->i_nlink >= max_links) +		error = -EMLINK;  	else  		error = dir->i_op->link(old_dentry, dir, new_dentry);  	mutex_unlock(&inode->i_mutex); @@ -3026,6 +3180,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,  {  	int error = 0;  	struct inode *target = new_dentry->d_inode; +	unsigned max_links = new_dir->i_sb->s_max_links;  	/*  	 * If we are going to change the parent - check write permissions, @@ -3049,6 +3204,11 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,  	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))  		goto out; +	error = -EMLINK; +	if (max_links && !target && new_dir != old_dir && +	    new_dir->i_nlink >= max_links) 
+		goto out; +  	if (target)  		shrink_dcache_parent(new_dentry);  	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -3347,9 +3507,9 @@ retry:  	if (err)  		goto fail; -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memcpy(kaddr, symname, len-1); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,  							page, fsdata); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 3d1e34f8a68..49df0e7f837 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -716,13 +716,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)          if (!root_inode)  		goto out_disconnect;  	DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); -	sb->s_root = d_alloc_root(root_inode); +	sb->s_root = d_make_root(root_inode);          if (!sb->s_root) -		goto out_no_root; +		goto out_disconnect;  	return 0; -out_no_root: -	iput(root_inode);  out_disconnect:  	ncp_lock_server(server);  	ncp_disconnect(server); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index dbcd82126ae..2a0e6c59914 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -64,6 +64,7 @@ config NFS_V4  	bool "NFS client support for NFS version 4"  	depends on NFS_FS  	select SUNRPC_GSS +	select KEYS  	help  	  This option enables support for version 4 of the NFS protocol  	  (RFC 3530) in the kernel's NFS client. @@ -98,6 +99,18 @@ config PNFS_OBJLAYOUT  	depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD  	default m +config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN +	string "NFSv4.1 Implementation ID Domain" +	depends on NFS_V4_1 +	default "kernel.org" +	help +	  This option defines the domain portion of the implementation ID that +	  may be sent in the NFS exchange_id operation.  The value must be in +	  the format of a DNS domain name and should be set to the DNS domain +	  name of the distribution. 
+	  If the NFS client is unchanged from the upstream kernel, this +	  option should be set to the default "kernel.org". +  config ROOT_NFS  	bool "Root file system on NFS"  	depends on NFS_FS=y && IP_PNP @@ -130,16 +143,10 @@ config NFS_USE_KERNEL_DNS  	bool  	depends on NFS_V4 && !NFS_USE_LEGACY_DNS  	select DNS_RESOLVER -	select KEYS  	default y -config NFS_USE_NEW_IDMAPPER -	bool "Use the new idmapper upcall routine" -	depends on NFS_V4 && KEYS -	help -	  Say Y here if you want NFS to use the new idmapper upcall functions. -	  You will need /sbin/request-key (usually provided by the keyutils -	  package).  For details, read -	  <file:Documentation/filesystems/nfs/idmapper.txt>. - -	  If you are unsure, say N. +config NFS_DEBUG +	bool +	depends on NFS_FS && SUNRPC_DEBUG +	select CRC32 +	default y diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 48cfac31f64..9c94297bb70 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -46,9 +46,6 @@ MODULE_LICENSE("GPL");  MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");  MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); -struct dentry *bl_device_pipe; -wait_queue_head_t bl_wq; -  static void print_page(struct page *page)  {  	dprintk("PRINTPAGE page %p\n", page); @@ -236,12 +233,11 @@ bl_read_pagelist(struct nfs_read_data *rdata)  	sector_t isect, extent_length = 0;  	struct parallel_io *par;  	loff_t f_offset = rdata->args.offset; -	size_t count = rdata->args.count;  	struct page **pages = rdata->args.pages;  	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; -	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, -	       rdata->npages, f_offset, count); +	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, +	       rdata->npages, f_offset, (unsigned int)rdata->args.count);  	par = alloc_parallel(rdata);  	if (!par) @@ -1025,10 +1021,128 @@ static const struct rpc_pipe_ops bl_upcall_ops = {  	.destroy_msg	= 
bl_pipe_destroy_msg,  }; +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, +					    struct rpc_pipe *pipe) +{ +	struct dentry *dir, *dentry; + +	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); +	if (dir == NULL) +		return ERR_PTR(-ENOENT); +	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); +	dput(dir); +	return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, +					  struct rpc_pipe *pipe) +{ +	if (pipe->dentry) +		rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, +			   void *ptr) +{ +	struct super_block *sb = ptr; +	struct net *net = sb->s_fs_info; +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct dentry *dentry; +	int ret = 0; + +	if (!try_module_get(THIS_MODULE)) +		return 0; + +	if (nn->bl_device_pipe == NULL) { +		module_put(THIS_MODULE); +		return 0; +	} + +	switch (event) { +	case RPC_PIPEFS_MOUNT: +		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); +		if (IS_ERR(dentry)) { +			ret = PTR_ERR(dentry); +			break; +		} +		nn->bl_device_pipe->dentry = dentry; +		break; +	case RPC_PIPEFS_UMOUNT: +		if (nn->bl_device_pipe->dentry) +			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); +		break; +	default: +		ret = -ENOTSUPP; +		break; +	} +	module_put(THIS_MODULE); +	return ret; +} + +static struct notifier_block nfs4blocklayout_block = { +	.notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, +						   struct rpc_pipe *pipe) +{ +	struct super_block *pipefs_sb; +	struct dentry *dentry; + +	pipefs_sb = rpc_get_sb_net(net); +	if (!pipefs_sb) +		return NULL; +	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); +	rpc_put_sb_net(net); +	return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, +					   struct rpc_pipe *pipe) +{ +	struct super_block *pipefs_sb; + +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		
nfs4blocklayout_unregister_sb(pipefs_sb, pipe); +		rpc_put_sb_net(net); +	} +} + +static int nfs4blocklayout_net_init(struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct dentry *dentry; + +	init_waitqueue_head(&nn->bl_wq); +	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); +	if (IS_ERR(nn->bl_device_pipe)) +		return PTR_ERR(nn->bl_device_pipe); +	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); +	if (IS_ERR(dentry)) { +		rpc_destroy_pipe_data(nn->bl_device_pipe); +		return PTR_ERR(dentry); +	} +	nn->bl_device_pipe->dentry = dentry; +	return 0; +} + +static void nfs4blocklayout_net_exit(struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); +	rpc_destroy_pipe_data(nn->bl_device_pipe); +	nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { +	.init = nfs4blocklayout_net_init, +	.exit = nfs4blocklayout_net_exit, +}; +  static int __init nfs4blocklayout_init(void)  { -	struct vfsmount *mnt; -	struct path path;  	int ret;  	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); @@ -1037,32 +1151,17 @@ static int __init nfs4blocklayout_init(void)  	if (ret)  		goto out; -	init_waitqueue_head(&bl_wq); - -	mnt = rpc_get_mount(); -	if (IS_ERR(mnt)) { -		ret = PTR_ERR(mnt); +	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); +	if (ret)  		goto out_remove; -	} - -	ret = vfs_path_lookup(mnt->mnt_root, -			      mnt, -			      NFS_PIPE_DIRNAME, 0, &path); +	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);  	if (ret) -		goto out_putrpc; - -	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, -				    &bl_upcall_ops, 0); -	path_put(&path); -	if (IS_ERR(bl_device_pipe)) { -		ret = PTR_ERR(bl_device_pipe); -		goto out_putrpc; -	} +		goto out_notifier;  out:  	return ret; -out_putrpc: -	rpc_put_mount(); +out_notifier: +	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 
 out_remove:  	pnfs_unregister_layoutdriver(&blocklayout_type);  	return ret; @@ -1073,9 +1172,9 @@ static void __exit nfs4blocklayout_exit(void)  	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",  	       __func__); +	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +	unregister_pernet_subsys(&nfs4blocklayout_net_ops);  	pnfs_unregister_layoutdriver(&blocklayout_type); -	rpc_unlink(bl_device_pipe); -	rpc_put_mount();  }  MODULE_ALIAS("nfs-layouttype4-3"); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index e31a2df28e7..03350690118 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -37,6 +37,7 @@  #include <linux/sunrpc/rpc_pipe_fs.h>  #include "../pnfs.h" +#include "../netns.h"  #define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)  #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) @@ -50,6 +51,7 @@ struct pnfs_block_dev {  	struct list_head		bm_node;  	struct nfs4_deviceid		bm_mdevid;    /* associated devid */  	struct block_device		*bm_mdev;     /* meta device itself */ +	struct net			*net;  };  enum exstate4 { @@ -151,9 +153,9 @@ BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)  	return BLK_LO2EXT(lseg->pls_layout);  } -struct bl_dev_msg { -	int32_t status; -	uint32_t major, minor; +struct bl_pipe_msg { +	struct rpc_pipe_msg msg; +	wait_queue_head_t *bl_wq;  };  struct bl_msg_hdr { @@ -161,9 +163,6 @@ struct bl_msg_hdr {  	u16 totallen; /* length of entire message, including hdr itself */  }; -extern struct dentry *bl_device_pipe; -extern wait_queue_head_t bl_wq; -  #define BL_DEVICE_UMOUNT               0x0 /* Umount--delete devices */  #define BL_DEVICE_MOUNT                0x1 /* Mount--create devices*/  #define BL_DEVICE_REQUEST_INIT         0x0 /* Start request */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index d08ba9107fd..a5c88a554d9 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ 
b/fs/nfs/blocklayout/blocklayoutdev.c @@ -46,7 +46,7 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)  	*rp = xdr_decode_hyper(*rp, &s);  	if (s & 0x1ff) { -		printk(KERN_WARNING "%s: sector not aligned\n", __func__); +		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);  		return -1;  	}  	*sp = s >> SECTOR_SHIFT; @@ -79,27 +79,30 @@ int nfs4_blkdev_put(struct block_device *bdev)  	return blkdev_put(bdev, FMODE_READ);  } -static struct bl_dev_msg bl_mount_reply; -  ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,  			 size_t mlen)  { +	struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, +					 nfs_net_id); +  	if (mlen != sizeof (struct bl_dev_msg))  		return -EINVAL; -	if (copy_from_user(&bl_mount_reply, src, mlen) != 0) +	if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)  		return -EFAULT; -	wake_up(&bl_wq); +	wake_up(&nn->bl_wq);  	return mlen;  }  void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)  { +	struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); +  	if (msg->errno >= 0)  		return; -	wake_up(&bl_wq); +	wake_up(bl_pipe_msg->bl_wq);  }  /* @@ -111,29 +114,33 @@ nfs4_blk_decode_device(struct nfs_server *server,  {  	struct pnfs_block_dev *rv;  	struct block_device *bd = NULL; -	struct rpc_pipe_msg msg; +	struct bl_pipe_msg bl_pipe_msg; +	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;  	struct bl_msg_hdr bl_msg = {  		.type = BL_DEVICE_MOUNT,  		.totallen = dev->mincount,  	};  	uint8_t *dataptr;  	DECLARE_WAITQUEUE(wq, current); -	struct bl_dev_msg *reply = &bl_mount_reply;  	int offset, len, i, rc; +	struct net *net = server->nfs_client->net; +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct bl_dev_msg *reply = &nn->bl_mount_reply;  	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);  	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,  		dev->mincount); -	memset(&msg, 0, sizeof(msg)); -	msg.data = kzalloc(sizeof(bl_msg) + 
dev->mincount, GFP_NOFS); -	if (!msg.data) { +	bl_pipe_msg.bl_wq = &nn->bl_wq; +	memset(msg, 0, sizeof(*msg)); +	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); +	if (!msg->data) {  		rv = ERR_PTR(-ENOMEM);  		goto out;  	} -	memcpy(msg.data, &bl_msg, sizeof(bl_msg)); -	dataptr = (uint8_t *) msg.data; +	memcpy(msg->data, &bl_msg, sizeof(bl_msg)); +	dataptr = (uint8_t *) msg->data;  	len = dev->mincount;  	offset = sizeof(bl_msg);  	for (i = 0; len > 0; i++) { @@ -142,13 +149,13 @@ nfs4_blk_decode_device(struct nfs_server *server,  		len -= PAGE_CACHE_SIZE;  		offset += PAGE_CACHE_SIZE;  	} -	msg.len = sizeof(bl_msg) + dev->mincount; +	msg->len = sizeof(bl_msg) + dev->mincount;  	dprintk("%s CALLING USERSPACE DAEMON\n", __func__); -	add_wait_queue(&bl_wq, &wq); -	rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); +	add_wait_queue(&nn->bl_wq, &wq); +	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);  	if (rc < 0) { -		remove_wait_queue(&bl_wq, &wq); +		remove_wait_queue(&nn->bl_wq, &wq);  		rv = ERR_PTR(rc);  		goto out;  	} @@ -156,7 +163,7 @@ nfs4_blk_decode_device(struct nfs_server *server,  	set_current_state(TASK_UNINTERRUPTIBLE);  	schedule();  	__set_current_state(TASK_RUNNING); -	remove_wait_queue(&bl_wq, &wq); +	remove_wait_queue(&nn->bl_wq, &wq);  	if (reply->status != BL_DEVICE_REQUEST_PROC) {  		dprintk("%s failed to open device: %d\n", @@ -181,13 +188,14 @@ nfs4_blk_decode_device(struct nfs_server *server,  	rv->bm_mdev = bd;  	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); +	rv->net = net;  	dprintk("%s Created device %s with bd_block_size %u\n",  		__func__,  		bd->bd_disk->disk_name,  		bd->bd_block_size);  out: -	kfree(msg.data); +	kfree(msg->data);  	return rv;  } diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index d055c755807..737d839bc17 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -38,9 +38,10 @@  #define NFSDBG_FACILITY      
   NFSDBG_PNFS_LD -static void dev_remove(dev_t dev) +static void dev_remove(struct net *net, dev_t dev)  { -	struct rpc_pipe_msg msg; +	struct bl_pipe_msg bl_pipe_msg; +	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;  	struct bl_dev_msg bl_umount_request;  	struct bl_msg_hdr bl_msg = {  		.type = BL_DEVICE_UMOUNT, @@ -48,36 +49,38 @@ static void dev_remove(dev_t dev)  	};  	uint8_t *dataptr;  	DECLARE_WAITQUEUE(wq, current); +	struct nfs_net *nn = net_generic(net, nfs_net_id);  	dprintk("Entering %s\n", __func__); -	memset(&msg, 0, sizeof(msg)); -	msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); -	if (!msg.data) +	bl_pipe_msg.bl_wq = &nn->bl_wq; +	memset(msg, 0, sizeof(*msg)); +	msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); +	if (!msg->data)  		goto out;  	memset(&bl_umount_request, 0, sizeof(bl_umount_request));  	bl_umount_request.major = MAJOR(dev);  	bl_umount_request.minor = MINOR(dev); -	memcpy(msg.data, &bl_msg, sizeof(bl_msg)); -	dataptr = (uint8_t *) msg.data; +	memcpy(msg->data, &bl_msg, sizeof(bl_msg)); +	dataptr = (uint8_t *) msg->data;  	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); -	msg.len = sizeof(bl_msg) + bl_msg.totallen; +	msg->len = sizeof(bl_msg) + bl_msg.totallen; -	add_wait_queue(&bl_wq, &wq); -	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { -		remove_wait_queue(&bl_wq, &wq); +	add_wait_queue(&nn->bl_wq, &wq); +	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { +		remove_wait_queue(&nn->bl_wq, &wq);  		goto out;  	}  	set_current_state(TASK_UNINTERRUPTIBLE);  	schedule();  	__set_current_state(TASK_RUNNING); -	remove_wait_queue(&bl_wq, &wq); +	remove_wait_queue(&nn->bl_wq, &wq);  out: -	kfree(msg.data); +	kfree(msg->data);  }  /* @@ -90,10 +93,10 @@ static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)  	dprintk("%s Releasing\n", __func__);  	rv = nfs4_blkdev_put(bdev->bm_mdev);  	if (rv) -		printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", +		
printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n",  				__func__, rv); -	dev_remove(bdev->bm_mdev->bd_dev); +	dev_remove(bdev->net, bdev->bm_mdev->bd_dev);  }  void bl_free_block_dev(struct pnfs_block_dev *bdev) diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 1abac09f7cd..1f9a6032796 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -147,7 +147,7 @@ static int _preload_range(struct pnfs_inval_markings *marks,  	count = (int)(end - start) / (int)tree->mtt_step_size;  	/* Pre-malloc what memory we might need */ -	storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); +	storage = kcalloc(count, sizeof(*storage), GFP_NOFS);  	if (!storage)  		return -ENOMEM;  	for (i = 0; i < count; i++) { diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index c98b439332f..dded2636811 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -13,6 +13,7 @@  #include <linux/slab.h>  #include <linux/sunrpc/cache.h>  #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/net_namespace.h>  #include "cache_lib.h" @@ -111,30 +112,54 @@ int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)  	return 0;  } -int nfs_cache_register(struct cache_detail *cd) +int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)  { -	struct vfsmount *mnt; -	struct path path;  	int ret; +	struct dentry *dir; -	mnt = rpc_get_mount(); -	if (IS_ERR(mnt)) -		return PTR_ERR(mnt); -	ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path); -	if (ret) -		goto err; -	ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd); -	path_put(&path); -	if (!ret) -		return ret; -err: -	rpc_put_mount(); +	dir = rpc_d_lookup_sb(sb, "cache"); +	BUG_ON(dir == NULL); +	ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); +	dput(dir);  	return ret;  } -void nfs_cache_unregister(struct cache_detail *cd) +int nfs_cache_register_net(struct net *net, struct cache_detail *cd)  { -	sunrpc_cache_unregister_pipefs(cd); -	
rpc_put_mount(); +	struct super_block *pipefs_sb; +	int ret = 0; + +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		ret = nfs_cache_register_sb(pipefs_sb, cd); +		rpc_put_sb_net(net); +	} +	return ret; +} + +void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd) +{ +	if (cd->u.pipefs.dir) +		sunrpc_cache_unregister_pipefs(cd); +} + +void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd) +{ +	struct super_block *pipefs_sb; + +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		nfs_cache_unregister_sb(pipefs_sb, cd); +		rpc_put_sb_net(net); +	} +} + +void nfs_cache_init(struct cache_detail *cd) +{ +	sunrpc_init_cache_detail(cd);  } +void nfs_cache_destroy(struct cache_detail *cd) +{ +	sunrpc_destroy_cache_detail(cd); +} diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 7cf6cafcc00..317db95e37f 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -23,5 +23,11 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);  extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);  extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); -extern int nfs_cache_register(struct cache_detail *cd); -extern void nfs_cache_unregister(struct cache_detail *cd); +extern void nfs_cache_init(struct cache_detail *cd); +extern void nfs_cache_destroy(struct cache_detail *cd); +extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd); +extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd); +extern int nfs_cache_register_sb(struct super_block *sb, +				 struct cache_detail *cd); +extern void nfs_cache_unregister_sb(struct super_block *sb, +				    struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 516f3375e06..eb95f5091c1 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -85,7 +85,7 @@ nfs4_callback_svc(void *vrqstp)  		}  		if (err < 0) {  			if (err != preverr) { -				printk(KERN_WARNING "%s: 
unexpected error " +				printk(KERN_WARNING "NFS: %s: unexpected error "  					"from svc_recv (%d)\n", __func__, err);  				preverr = err;  			} @@ -101,12 +101,12 @@ nfs4_callback_svc(void *vrqstp)  /*   * Prepare to bring up the NFSv4 callback service   */ -struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv) +static struct svc_rqst * +nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)  {  	int ret; -	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, +	ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET,  				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);  	if (ret <= 0)  		goto out_err; @@ -114,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv)  	dprintk("NFS: Callback listener port = %u (af %u)\n",  			nfs_callback_tcpport, PF_INET); -	ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, +	ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET6,  				nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);  	if (ret > 0) {  		nfs_callback_tcpport6 = ret; @@ -172,7 +172,7 @@ nfs41_callback_svc(void *vrqstp)  /*   * Bring up the NFSv4.1 callback service   */ -struct svc_rqst * +static struct svc_rqst *  nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)  {  	struct svc_rqst *rqstp; @@ -183,7 +183,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt)  	 * fore channel connection.  	 
* Returns the input port (0) and sets the svc_serv bc_xprt on success  	 */ -	ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, +	ret = svc_create_xprt(serv, "tcp-bc", xprt->xprt_net, PF_INET, 0,  			      SVC_SOCK_ANONYMOUS);  	if (ret < 0) {  		rqstp = ERR_PTR(ret); @@ -269,7 +269,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)  					serv, xprt, &rqstp, &callback_svc);  	if (!minorversion_setup) {  		/* v4.0 callback setup */ -		rqstp = nfs4_callback_up(serv); +		rqstp = nfs4_callback_up(serv, xprt);  		callback_svc = nfs4_callback_svc;  	} @@ -332,7 +332,6 @@ void nfs_callback_down(int minorversion)  int  check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)  { -	struct rpc_clnt *r = clp->cl_rpcclient;  	char *p = svc_gss_principal(rqstp);  	if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) @@ -353,7 +352,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)  	if (memcmp(p, "nfs@", 4) != 0)  		return 0;  	p += 4; -	if (strcmp(p, r->cl_server) != 0) +	if (strcmp(p, clp->cl_hostname) != 0)  		return 0;  	return 1;  } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index c89d3b9e483..a5527c90a5a 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -38,7 +38,8 @@ enum nfs4_callback_opnum {  struct cb_process_state {  	__be32			drc_status;  	struct nfs_client	*clp; -	int			slotid; +	u32			slotid; +	struct net		*net;  };  struct cb_compound_hdr_arg { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 54cea8ad5a7..1b5d809a105 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -8,6 +8,7 @@  #include <linux/nfs4.h>  #include <linux/nfs_fs.h>  #include <linux/slab.h> +#include <linux/rcupdate.h>  #include "nfs4_fs.h"  #include "callback.h"  #include "delegation.h" @@ -33,7 +34,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,  	res->bitmap[0] = res->bitmap[1] = 0;  	res->status = htonl(NFS4ERR_BADHANDLE); -	dprintk("NFS: GETATTR callback 
request from %s\n", +	dprintk_rcu("NFS: GETATTR callback request from %s\n",  		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));  	inode = nfs_delegation_find_inode(cps->clp, &args->fh); @@ -73,7 +74,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,  	if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */  		goto out; -	dprintk("NFS: RECALL callback request from %s\n", +	dprintk_rcu("NFS: RECALL callback request from %s\n",  		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));  	res = htonl(NFS4ERR_BADHANDLE); @@ -86,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,  		res = 0;  		break;  	case -ENOENT: -		if (res != 0) -			res = htonl(NFS4ERR_BAD_STATEID); +		res = htonl(NFS4ERR_BAD_STATEID);  		break;  	default:  		res = htonl(NFS4ERR_RESOURCE); @@ -98,52 +98,64 @@ out:  	return res;  } -int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) -{ -	if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, -					 sizeof(delegation->stateid.data)) != 0) -		return 0; -	return 1; -} -  #if defined(CONFIG_NFS_V4_1) -static u32 initiate_file_draining(struct nfs_client *clp, -				  struct cb_layoutrecallargs *args) +/* + * Lookup a layout by filehandle. + * + * Note: gets a refcount on the layout hdr and on its respective inode. + * Caller must put the layout hdr and the inode. + * + * TODO: keep track of all layouts (and delegations) in a hash table + * hashed by filehandle. 
+ */ +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)  {  	struct nfs_server *server; -	struct pnfs_layout_hdr *lo;  	struct inode *ino; -	bool found = false; -	u32 rv = NFS4ERR_NOMATCHING_LAYOUT; -	LIST_HEAD(free_me_list); +	struct pnfs_layout_hdr *lo; -	spin_lock(&clp->cl_lock); -	rcu_read_lock();  	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {  		list_for_each_entry(lo, &server->layouts, plh_layouts) { -			if (nfs_compare_fh(&args->cbl_fh, -					   &NFS_I(lo->plh_inode)->fh)) +			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))  				continue;  			ino = igrab(lo->plh_inode);  			if (!ino)  				continue; -			found = true; -			/* Without this, layout can be freed as soon -			 * as we release cl_lock. -			 */  			get_layout_hdr(lo); -			break; +			return lo;  		} -		if (found) -			break;  	} + +	return NULL; +} + +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh) +{ +	struct pnfs_layout_hdr *lo; + +	spin_lock(&clp->cl_lock); +	rcu_read_lock(); +	lo = get_layout_by_fh_locked(clp, fh);  	rcu_read_unlock();  	spin_unlock(&clp->cl_lock); -	if (!found) +	return lo; +} + +static u32 initiate_file_draining(struct nfs_client *clp, +				  struct cb_layoutrecallargs *args) +{ +	struct inode *ino; +	struct pnfs_layout_hdr *lo; +	u32 rv = NFS4ERR_NOMATCHING_LAYOUT; +	LIST_HEAD(free_me_list); + +	lo = get_layout_by_fh(clp, &args->cbl_fh); +	if (!lo)  		return NFS4ERR_NOMATCHING_LAYOUT; +	ino = lo->plh_inode;  	spin_lock(&ino->i_lock);  	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||  	    mark_matching_lsegs_invalid(lo, &free_me_list, @@ -213,17 +225,13 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,  static u32 do_callback_layoutrecall(struct nfs_client *clp,  				    struct cb_layoutrecallargs *args)  { -	u32 res = NFS4ERR_DELAY; +	u32 res;  	dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); -	if 
(test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) -		goto out;  	if (args->cbl_recall_type == RETURN_FILE)  		res = initiate_file_draining(clp, args);  	else  		res = initiate_bulk_draining(clp, args); -	clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); -out:  	dprintk("%s returning %i\n", __func__, res);  	return res; @@ -303,21 +311,6 @@ out:  	return res;  } -int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) -{ -	if (delegation == NULL) -		return 0; - -	if (stateid->stateid.seqid != 0) -		return 0; -	if (memcmp(&delegation->stateid.stateid.other, -		   &stateid->stateid.other, -		   NFS4_STATEID_OTHER_SIZE)) -		return 0; - -	return 1; -} -  /*   * Validate the sequenceID sent by the server.   * Return success if the sequenceID is one more than what we last saw on @@ -441,7 +434,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,  	int i;  	__be32 status = htonl(NFS4ERR_BADSESSION); -	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); +	clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);  	if (clp == NULL)  		goto out; @@ -517,7 +510,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,  	if (!cps->clp) /* set in cb_sequence */  		goto out; -	dprintk("NFS: RECALL_ANY callback request from %s\n", +	dprintk_rcu("NFS: RECALL_ANY callback request from %s\n",  		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));  	status = cpu_to_be32(NFS4ERR_INVAL); @@ -552,7 +545,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,  	if (!cps->clp) /* set in cb_sequence */  		goto out; -	dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", +	dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n",  		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),  		args->crsa_target_max_slots); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 
d50b2742f23..95bfc243992 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -9,6 +9,8 @@  #include <linux/sunrpc/svc.h>  #include <linux/nfs4.h>  #include <linux/nfs_fs.h> +#include <linux/ratelimit.h> +#include <linux/printk.h>  #include <linux/slab.h>  #include <linux/sunrpc/bc_xprt.h>  #include "nfs4_fs.h" @@ -73,7 +75,7 @@ static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)  	p = xdr_inline_decode(xdr, nbytes);  	if (unlikely(p == NULL)) -		printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); +		printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");  	return p;  } @@ -138,10 +140,10 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)  {  	__be32 *p; -	p = read_buf(xdr, 16); +	p = read_buf(xdr, NFS4_STATEID_SIZE);  	if (unlikely(p == NULL))  		return htonl(NFS4ERR_RESOURCE); -	memcpy(stateid->data, p, 16); +	memcpy(stateid, p, NFS4_STATEID_SIZE);  	return 0;  } @@ -155,7 +157,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound  		return status;  	/* We do not like overly long tags! */  	if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) { -		printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", +		printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",  				__func__, hdr->taglen);  		return htonl(NFS4ERR_RESOURCE);  	} @@ -167,7 +169,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound  	if (hdr->minorversion <= 1) {  		hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */  	} else { -		printk(KERN_WARNING "%s: NFSv4 server callback with " +		pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "  			"illegal minor version %u!\n",  			__func__, hdr->minorversion);  		return htonl(NFS4ERR_MINOR_VERS_MISMATCH); @@ -759,14 +761,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)  	 * Let the state manager know callback processing done.  	 
* A single slot, so highest used slotid is either 0 or -1  	 */ -	tbl->highest_used_slotid = -1; +	tbl->highest_used_slotid = NFS4_NO_SLOT;  	nfs4_check_drain_bc_complete(session);  	spin_unlock(&tbl->slot_tbl_lock);  }  static void nfs4_cb_free_slot(struct cb_process_state *cps)  { -	if (cps->slotid != -1) +	if (cps->slotid != NFS4_NO_SLOT)  		nfs4_callback_free_slot(cps->clp->cl_session);  } @@ -860,7 +862,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r  	struct cb_process_state cps = {  		.drc_status = 0,  		.clp = NULL, -		.slotid = -1, +		.slotid = NFS4_NO_SLOT, +		.net = rqstp->rq_xprt->xpt_net,  	};  	unsigned int nops = 0; @@ -876,7 +879,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r  		return rpc_garbage_args;  	if (hdr_arg.minorversion == 0) { -		cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident); +		cps.clp = nfs4_find_client_ident(rqstp->rq_xprt->xpt_net, hdr_arg.cb_ident);  		if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))  			return rpc_drop_reply;  	} diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 31778f74357..4a108a0a2a6 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -36,9 +36,12 @@  #include <linux/inet.h>  #include <linux/in6.h>  #include <linux/slab.h> +#include <linux/idr.h>  #include <net/ipv6.h>  #include <linux/nfs_xdr.h>  #include <linux/sunrpc/bc_xprt.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h>  #include <asm/system.h> @@ -49,15 +52,12 @@  #include "internal.h"  #include "fscache.h"  #include "pnfs.h" +#include "netns.h"  #define NFSDBG_FACILITY		NFSDBG_CLIENT -static DEFINE_SPINLOCK(nfs_client_lock); -static LIST_HEAD(nfs_client_list); -static LIST_HEAD(nfs_volume_list);  static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);  #ifdef CONFIG_NFS_V4 -static DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */  /*   * Get a unique NFSv4.0 callback identifier which will be used @@ -66,15 +66,16 @@ static 
DEFINE_IDR(cb_ident_idr); /* Protected by nfs_client_lock */  static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)  {  	int ret = 0; +	struct nfs_net *nn = net_generic(clp->net, nfs_net_id);  	if (clp->rpc_ops->version != 4 || minorversion != 0)  		return ret;  retry: -	if (!idr_pre_get(&cb_ident_idr, GFP_KERNEL)) +	if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))  		return -ENOMEM; -	spin_lock(&nfs_client_lock); -	ret = idr_get_new(&cb_ident_idr, clp, &clp->cl_cb_ident); -	spin_unlock(&nfs_client_lock); +	spin_lock(&nn->nfs_client_lock); +	ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident); +	spin_unlock(&nn->nfs_client_lock);  	if (ret == -EAGAIN)  		goto retry;  	return ret; @@ -89,7 +90,7 @@ static bool nfs4_disable_idmapping = true;  /*   * RPC cruft for NFS   */ -static struct rpc_version *nfs_version[5] = { +static const struct rpc_version *nfs_version[5] = {  	[2]			= &nfs_version2,  #ifdef CONFIG_NFS_V3  	[3]			= &nfs_version3, @@ -99,7 +100,7 @@ static struct rpc_version *nfs_version[5] = {  #endif  }; -struct rpc_program nfs_program = { +const struct rpc_program nfs_program = {  	.name			= "nfs",  	.number			= NFS_PROGRAM,  	.nrvers			= ARRAY_SIZE(nfs_version), @@ -115,11 +116,11 @@ struct rpc_stat nfs_rpcstat = {  #ifdef CONFIG_NFS_V3_ACL  static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program }; -static struct rpc_version *	nfsacl_version[] = { +static const struct rpc_version *nfsacl_version[] = {  	[3]			= &nfsacl_version3,  }; -struct rpc_program		nfsacl_program = { +const struct rpc_program nfsacl_program = {  	.name			= "nfsacl",  	.number			= NFS_ACL_PROGRAM,  	.nrvers			= ARRAY_SIZE(nfsacl_version), @@ -135,6 +136,7 @@ struct nfs_client_initdata {  	const struct nfs_rpc_ops *rpc_ops;  	int proto;  	u32 minorversion; +	struct net *net;  };  /* @@ -171,6 +173,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_  	clp->cl_rpcclient = ERR_PTR(-EINVAL);  	clp->cl_proto = 
cl_init->proto; +	clp->net = get_net(cl_init->net);  #ifdef CONFIG_NFS_V4  	err = nfs_get_cb_ident_idr(clp, cl_init->minorversion); @@ -202,8 +205,11 @@ error_0:  #ifdef CONFIG_NFS_V4_1  static void nfs4_shutdown_session(struct nfs_client *clp)  { -	if (nfs4_has_session(clp)) +	if (nfs4_has_session(clp)) { +		nfs4_deviceid_purge_client(clp);  		nfs4_destroy_session(clp->cl_session); +	} +  }  #else /* CONFIG_NFS_V4_1 */  static void nfs4_shutdown_session(struct nfs_client *clp) @@ -233,16 +239,20 @@ static void nfs4_shutdown_client(struct nfs_client *clp)  }  /* idr_remove_all is not needed as all id's are removed by nfs_put_client */ -void nfs_cleanup_cb_ident_idr(void) +void nfs_cleanup_cb_ident_idr(struct net *net)  { -	idr_destroy(&cb_ident_idr); +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	idr_destroy(&nn->cb_ident_idr);  }  /* nfs_client_lock held */  static void nfs_cb_idr_remove_locked(struct nfs_client *clp)  { +	struct nfs_net *nn = net_generic(clp->net, nfs_net_id); +  	if (clp->cl_cb_ident) -		idr_remove(&cb_ident_idr, clp->cl_cb_ident); +		idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);  }  static void pnfs_init_server(struct nfs_server *server) @@ -260,7 +270,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)  {  } -void nfs_cleanup_cb_ident_idr(void) +void nfs_cleanup_cb_ident_idr(struct net *net)  {  } @@ -292,10 +302,10 @@ static void nfs_free_client(struct nfs_client *clp)  	if (clp->cl_machine_cred != NULL)  		put_rpccred(clp->cl_machine_cred); -	nfs4_deviceid_purge_client(clp); - +	put_net(clp->net);  	kfree(clp->cl_hostname);  	kfree(clp->server_scope); +	kfree(clp->impl_id);  	kfree(clp);  	dprintk("<-- nfs_free_client()\n"); @@ -306,15 +316,18 @@ static void nfs_free_client(struct nfs_client *clp)   */  void nfs_put_client(struct nfs_client *clp)  { +	struct nfs_net *nn; +  	if (!clp)  		return;  	dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); +	nn = net_generic(clp->net, nfs_net_id); -	if 
(atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { +	if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) {  		list_del(&clp->cl_share_link);  		nfs_cb_idr_remove_locked(clp); -		spin_unlock(&nfs_client_lock); +		spin_unlock(&nn->nfs_client_lock);  		BUG_ON(!list_empty(&clp->cl_superblocks)); @@ -392,6 +405,7 @@ static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,  		(sin1->sin_port == sin2->sin_port);  } +#if defined(CONFIG_NFS_V4_1)  /*   * Test if two socket addresses represent the same actual socket,   * by comparing (only) relevant fields, excluding the port number. @@ -410,6 +424,7 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,  	}  	return 0;  } +#endif /* CONFIG_NFS_V4_1 */  /*   * Test if two socket addresses represent the same actual socket, @@ -430,10 +445,10 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,  	return 0;  } +#if defined(CONFIG_NFS_V4_1)  /* Common match routine for v4.0 and v4.1 callback services */ -bool -nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp, -		     u32 minorversion) +static bool nfs4_cb_match_client(const struct sockaddr *addr, +		struct nfs_client *clp, u32 minorversion)  {  	struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; @@ -453,6 +468,7 @@ nfs4_cb_match_client(const struct sockaddr *addr, struct nfs_client *clp,  	return true;  } +#endif /* CONFIG_NFS_V4_1 */  /*   * Find an nfs_client on the list that matches the initialisation data @@ -462,8 +478,9 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat  {  	struct nfs_client *clp;  	const struct sockaddr *sap = data->addr; +	struct nfs_net *nn = net_generic(data->net, nfs_net_id); -	list_for_each_entry(clp, &nfs_client_list, cl_share_link) { +	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {  	        const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;  		/* Don't match clients that failed to initialise properly */  		if 
(clp->cl_cons_state < 0) @@ -501,13 +518,14 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,  {  	struct nfs_client *clp, *new = NULL;  	int error; +	struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);  	dprintk("--> nfs_get_client(%s,v%u)\n",  		cl_init->hostname ?: "", cl_init->rpc_ops->version);  	/* see if the client already exists */  	do { -		spin_lock(&nfs_client_lock); +		spin_lock(&nn->nfs_client_lock);  		clp = nfs_match_client(cl_init);  		if (clp) @@ -515,7 +533,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,  		if (new)  			goto install_client; -		spin_unlock(&nfs_client_lock); +		spin_unlock(&nn->nfs_client_lock);  		new = nfs_alloc_client(cl_init);  	} while (!IS_ERR(new)); @@ -526,8 +544,8 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,  	/* install a new client and return with it unready */  install_client:  	clp = new; -	list_add(&clp->cl_share_link, &nfs_client_list); -	spin_unlock(&nfs_client_lock); +	list_add(&clp->cl_share_link, &nn->nfs_client_list); +	spin_unlock(&nn->nfs_client_lock);  	error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,  					      authflavour, noresvport); @@ -542,7 +560,7 @@ install_client:  	 * - make sure it's ready before returning  	 */  found_client: -	spin_unlock(&nfs_client_lock); +	spin_unlock(&nn->nfs_client_lock);  	if (new)  		nfs_free_client(new); @@ -642,7 +660,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,  {  	struct rpc_clnt		*clnt = NULL;  	struct rpc_create_args args = { -		.net		= &init_net, +		.net		= clp->net,  		.protocol	= clp->cl_proto,  		.address	= (struct sockaddr *)&clp->cl_addr,  		.addrsize	= clp->cl_addrlen, @@ -696,6 +714,7 @@ static int nfs_start_lockd(struct nfs_server *server)  		.nfs_version	= clp->rpc_ops->version,  		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?  					
1 : 0, +		.net		= clp->net,  	};  	if (nlm_init.nfs_version > 3) @@ -831,6 +850,7 @@ static int nfs_init_server(struct nfs_server *server,  		.addrlen = data->nfs_server.addrlen,  		.rpc_ops = &nfs_v2_clientops,  		.proto = data->nfs_server.protocol, +		.net = data->net,  	};  	struct rpc_timeout timeparms;  	struct nfs_client *clp; @@ -1029,25 +1049,30 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve  static void nfs_server_insert_lists(struct nfs_server *server)  {  	struct nfs_client *clp = server->nfs_client; +	struct nfs_net *nn = net_generic(clp->net, nfs_net_id); -	spin_lock(&nfs_client_lock); +	spin_lock(&nn->nfs_client_lock);  	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); -	list_add_tail(&server->master_link, &nfs_volume_list); +	list_add_tail(&server->master_link, &nn->nfs_volume_list);  	clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); -	spin_unlock(&nfs_client_lock); +	spin_unlock(&nn->nfs_client_lock);  }  static void nfs_server_remove_lists(struct nfs_server *server)  {  	struct nfs_client *clp = server->nfs_client; +	struct nfs_net *nn; -	spin_lock(&nfs_client_lock); +	if (clp == NULL) +		return; +	nn = net_generic(clp->net, nfs_net_id); +	spin_lock(&nn->nfs_client_lock);  	list_del_rcu(&server->client_link); -	if (clp && list_empty(&clp->cl_superblocks)) +	if (list_empty(&clp->cl_superblocks))  		set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);  	list_del(&server->master_link); -	spin_unlock(&nfs_client_lock); +	spin_unlock(&nn->nfs_client_lock);  	synchronize_rcu();  } @@ -1086,6 +1111,8 @@ static struct nfs_server *nfs_alloc_server(void)  		return NULL;  	} +	ida_init(&server->openowner_id); +	ida_init(&server->lockowner_id);  	pnfs_init_server(server);  	return server; @@ -1111,6 +1138,8 @@ void nfs_free_server(struct nfs_server *server)  	nfs_put_client(server->nfs_client); +	ida_destroy(&server->lockowner_id); +	ida_destroy(&server->openowner_id);  	nfs_free_iostats(server->io_stats);  	
bdi_destroy(&server->backing_dev_info);  	kfree(server); @@ -1189,45 +1218,19 @@ error:  /*   * NFSv4.0 callback thread helper   * - * Find a client by IP address, protocol version, and minorversion - * - * Called from the pg_authenticate method. The callback identifier - * is not used as it has not been decoded. - * - * Returns NULL if no such client - */ -struct nfs_client * -nfs4_find_client_no_ident(const struct sockaddr *addr) -{ -	struct nfs_client *clp; - -	spin_lock(&nfs_client_lock); -	list_for_each_entry(clp, &nfs_client_list, cl_share_link) { -		if (nfs4_cb_match_client(addr, clp, 0) == false) -			continue; -		atomic_inc(&clp->cl_count); -		spin_unlock(&nfs_client_lock); -		return clp; -	} -	spin_unlock(&nfs_client_lock); -	return NULL; -} - -/* - * NFSv4.0 callback thread helper - *   * Find a client by callback identifier   */  struct nfs_client * -nfs4_find_client_ident(int cb_ident) +nfs4_find_client_ident(struct net *net, int cb_ident)  {  	struct nfs_client *clp; +	struct nfs_net *nn = net_generic(net, nfs_net_id); -	spin_lock(&nfs_client_lock); -	clp = idr_find(&cb_ident_idr, cb_ident); +	spin_lock(&nn->nfs_client_lock); +	clp = idr_find(&nn->cb_ident_idr, cb_ident);  	if (clp)  		atomic_inc(&clp->cl_count); -	spin_unlock(&nfs_client_lock); +	spin_unlock(&nn->nfs_client_lock);  	return clp;  } @@ -1240,13 +1243,14 @@ nfs4_find_client_ident(int cb_ident)   * Returns NULL if no such client   */  struct nfs_client * -nfs4_find_client_sessionid(const struct sockaddr *addr, +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,  			   struct nfs4_sessionid *sid)  {  	struct nfs_client *clp; +	struct nfs_net *nn = net_generic(net, nfs_net_id); -	spin_lock(&nfs_client_lock); -	list_for_each_entry(clp, &nfs_client_list, cl_share_link) { +	spin_lock(&nn->nfs_client_lock); +	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {  		if (nfs4_cb_match_client(addr, clp, 1) == false)  			continue; @@ -1259,17 +1263,17 @@ 
nfs4_find_client_sessionid(const struct sockaddr *addr,  			continue;  		atomic_inc(&clp->cl_count); -		spin_unlock(&nfs_client_lock); +		spin_unlock(&nn->nfs_client_lock);  		return clp;  	} -	spin_unlock(&nfs_client_lock); +	spin_unlock(&nn->nfs_client_lock);  	return NULL;  }  #else /* CONFIG_NFS_V4_1 */  struct nfs_client * -nfs4_find_client_sessionid(const struct sockaddr *addr, +nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,  			   struct nfs4_sessionid *sid)  {  	return NULL; @@ -1284,16 +1288,18 @@ static int nfs4_init_callback(struct nfs_client *clp)  	int error;  	if (clp->rpc_ops->version == 4) { +		struct rpc_xprt *xprt; + +		xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); +  		if (nfs4_has_session(clp)) { -			error = xprt_setup_backchannel( -						clp->cl_rpcclient->cl_xprt, +			error = xprt_setup_backchannel(xprt,  						NFS41_BC_MIN_CALLBACKS);  			if (error < 0)  				return error;  		} -		error = nfs_callback_up(clp->cl_mvops->minor_version, -					clp->cl_rpcclient->cl_xprt); +		error = nfs_callback_up(clp->cl_mvops->minor_version, xprt);  		if (error < 0) {  			dprintk("%s: failed to start callback. 
Error = %d\n",  				__func__, error); @@ -1344,6 +1350,7 @@ int nfs4_init_client(struct nfs_client *clp,  		     rpc_authflavor_t authflavour,  		     int noresvport)  { +	char buf[INET6_ADDRSTRLEN + 1];  	int error;  	if (clp->cl_cons_state == NFS_CS_READY) { @@ -1359,6 +1366,20 @@ int nfs4_init_client(struct nfs_client *clp,  				      1, noresvport);  	if (error < 0)  		goto error; + +	/* If no clientaddr= option was specified, find a usable cb address */ +	if (ip_addr == NULL) { +		struct sockaddr_storage cb_addr; +		struct sockaddr *sap = (struct sockaddr *)&cb_addr; + +		error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); +		if (error < 0) +			goto error; +		error = rpc_ntop(sap, buf, sizeof(buf)); +		if (error < 0) +			goto error; +		ip_addr = (const char *)buf; +	}  	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));  	error = nfs_idmap_new(clp); @@ -1393,7 +1414,7 @@ static int nfs4_set_client(struct nfs_server *server,  		const char *ip_addr,  		rpc_authflavor_t authflavour,  		int proto, const struct rpc_timeout *timeparms, -		u32 minorversion) +		u32 minorversion, struct net *net)  {  	struct nfs_client_initdata cl_init = {  		.hostname = hostname, @@ -1402,6 +1423,7 @@ static int nfs4_set_client(struct nfs_server *server,  		.rpc_ops = &nfs_v4_clientops,  		.proto = proto,  		.minorversion = minorversion, +		.net = net,  	};  	struct nfs_client *clp;  	int error; @@ -1453,6 +1475,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,  		.rpc_ops = &nfs_v4_clientops,  		.proto = ds_proto,  		.minorversion = mds_clp->cl_minorversion, +		.net = mds_clp->net,  	};  	struct rpc_timeout ds_timeout = {  		.to_initval = 15 * HZ, @@ -1580,7 +1603,8 @@ static int nfs4_init_server(struct nfs_server *server,  			data->auth_flavors[0],  			data->nfs_server.protocol,  			&timeparms, -			data->minorversion); +			data->minorversion, +			data->net);  	if (error < 0)  		goto error; @@ -1675,9 +1699,10 @@ struct nfs_server 
*nfs4_create_referral_server(struct nfs_clone_mount *data,  				data->addrlen,  				parent_client->cl_ipaddr,  				data->authflavor, -				parent_server->client->cl_xprt->prot, +				rpc_protocol(parent_server->client),  				parent_server->client->cl_timeout, -				parent_client->cl_mvops->minor_version); +				parent_client->cl_mvops->minor_version, +				parent_client->net);  	if (error < 0)  		goto error; @@ -1770,6 +1795,18 @@ out_free_server:  	return ERR_PTR(error);  } +void nfs_clients_init(struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); + +	INIT_LIST_HEAD(&nn->nfs_client_list); +	INIT_LIST_HEAD(&nn->nfs_volume_list); +#ifdef CONFIG_NFS_V4 +	idr_init(&nn->cb_ident_idr); +#endif +	spin_lock_init(&nn->nfs_client_lock); +} +  #ifdef CONFIG_PROC_FS  static struct proc_dir_entry *proc_fs_nfs; @@ -1823,13 +1860,15 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)  {  	struct seq_file *m;  	int ret; +	struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; +	struct net *net = pid_ns->child_reaper->nsproxy->net_ns;  	ret = seq_open(file, &nfs_server_list_ops);  	if (ret < 0)  		return ret;  	m = file->private_data; -	m->private = PDE(inode)->data; +	m->private = net;  	return 0;  } @@ -1839,9 +1878,11 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)   */  static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)  { +	struct nfs_net *nn = net_generic(m->private, nfs_net_id); +  	/* lock the list against modification */ -	spin_lock(&nfs_client_lock); -	return seq_list_start_head(&nfs_client_list, *_pos); +	spin_lock(&nn->nfs_client_lock); +	return seq_list_start_head(&nn->nfs_client_list, *_pos);  }  /* @@ -1849,7 +1890,9 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)   */  static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)  { -	return seq_list_next(v, &nfs_client_list, pos); +	struct nfs_net *nn = net_generic(p->private, 
nfs_net_id); + +	return seq_list_next(v, &nn->nfs_client_list, pos);  }  /* @@ -1857,7 +1900,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)   */  static void nfs_server_list_stop(struct seq_file *p, void *v)  { -	spin_unlock(&nfs_client_lock); +	struct nfs_net *nn = net_generic(p->private, nfs_net_id); + +	spin_unlock(&nn->nfs_client_lock);  }  /* @@ -1866,9 +1911,10 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)  static int nfs_server_list_show(struct seq_file *m, void *v)  {  	struct nfs_client *clp; +	struct nfs_net *nn = net_generic(m->private, nfs_net_id);  	/* display header on line 1 */ -	if (v == &nfs_client_list) { +	if (v == &nn->nfs_client_list) {  		seq_puts(m, "NV SERVER   PORT USE HOSTNAME\n");  		return 0;  	} @@ -1880,12 +1926,14 @@ static int nfs_server_list_show(struct seq_file *m, void *v)  	if (clp->cl_cons_state != NFS_CS_READY)  		return 0; +	rcu_read_lock();  	seq_printf(m, "v%u %s %s %3d %s\n",  		   clp->rpc_ops->version,  		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),  		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),  		   atomic_read(&clp->cl_count),  		   clp->cl_hostname); +	rcu_read_unlock();  	return 0;  } @@ -1897,13 +1945,15 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)  {  	struct seq_file *m;  	int ret; +	struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info; +	struct net *net = pid_ns->child_reaper->nsproxy->net_ns;  	ret = seq_open(file, &nfs_volume_list_ops);  	if (ret < 0)  		return ret;  	m = file->private_data; -	m->private = PDE(inode)->data; +	m->private = net;  	return 0;  } @@ -1913,9 +1963,11 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)   */  static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)  { +	struct nfs_net *nn = net_generic(m->private, nfs_net_id); +  	/* lock the list against modification */ -	spin_lock(&nfs_client_lock); -	return 
seq_list_start_head(&nfs_volume_list, *_pos); +	spin_lock(&nn->nfs_client_lock); +	return seq_list_start_head(&nn->nfs_volume_list, *_pos);  }  /* @@ -1923,7 +1975,9 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)   */  static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)  { -	return seq_list_next(v, &nfs_volume_list, pos); +	struct nfs_net *nn = net_generic(p->private, nfs_net_id); + +	return seq_list_next(v, &nn->nfs_volume_list, pos);  }  /* @@ -1931,7 +1985,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)   */  static void nfs_volume_list_stop(struct seq_file *p, void *v)  { -	spin_unlock(&nfs_client_lock); +	struct nfs_net *nn = net_generic(p->private, nfs_net_id); + +	spin_unlock(&nn->nfs_client_lock);  }  /* @@ -1942,9 +1998,10 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)  	struct nfs_server *server;  	struct nfs_client *clp;  	char dev[8], fsid[17]; +	struct nfs_net *nn = net_generic(m->private, nfs_net_id);  	/* display header on line 1 */ -	if (v == &nfs_volume_list) { +	if (v == &nn->nfs_volume_list) {  		seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");  		return 0;  	} @@ -1959,6 +2016,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)  		 (unsigned long long) server->fsid.major,  		 (unsigned long long) server->fsid.minor); +	rcu_read_lock();  	seq_printf(m, "v%u %s %s %-7s %-17s %s\n",  		   clp->rpc_ops->version,  		   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), @@ -1966,6 +2024,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)  		   dev,  		   fsid,  		   nfs_server_fscache_state(server)); +	rcu_read_unlock();  	return 0;  } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7f265406980..89af1d26927 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -105,7 +105,7 @@ again:  			continue;  		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))  			continue; -		if 
(memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) +		if (!nfs4_stateid_match(&state->stateid, stateid))  			continue;  		get_nfs_open_context(ctx);  		spin_unlock(&inode->i_lock); @@ -139,8 +139,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,  	if (delegation != NULL) {  		spin_lock(&delegation->lock);  		if (delegation->inode != NULL) { -			memcpy(delegation->stateid.data, res->delegation.data, -			       sizeof(delegation->stateid.data)); +			nfs4_stateid_copy(&delegation->stateid, &res->delegation);  			delegation->type = res->delegation_type;  			delegation->maxsize = res->maxsize;  			oldcred = delegation->cred; @@ -236,8 +235,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct  	delegation = kmalloc(sizeof(*delegation), GFP_NOFS);  	if (delegation == NULL)  		return -ENOMEM; -	memcpy(delegation->stateid.data, res->delegation.data, -			sizeof(delegation->stateid.data)); +	nfs4_stateid_copy(&delegation->stateid, &res->delegation);  	delegation->type = res->delegation_type;  	delegation->maxsize = res->maxsize;  	delegation->change_attr = inode->i_version; @@ -250,19 +248,22 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct  	old_delegation = rcu_dereference_protected(nfsi->delegation,  					lockdep_is_held(&clp->cl_lock));  	if (old_delegation != NULL) { -		if (memcmp(&delegation->stateid, &old_delegation->stateid, -					sizeof(old_delegation->stateid)) == 0 && +		if (nfs4_stateid_match(&delegation->stateid, +					&old_delegation->stateid) &&  				delegation->type == old_delegation->type) {  			goto out;  		}  		/*  		 * Deal with broken servers that hand out two  		 * delegations for the same file. +		 * Allow for upgrades to a WRITE delegation, but +		 * nothing else.  		 
*/  		dfprintk(FILE, "%s: server %s handed out "  				"a duplicate delegation!\n",  				__func__, clp->cl_hostname); -		if (delegation->type <= old_delegation->type) { +		if (delegation->type == old_delegation->type || +		    !(delegation->type & FMODE_WRITE)) {  			freeme = delegation;  			delegation = NULL;  			goto out; @@ -455,17 +456,24 @@ static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,  	rcu_read_unlock();  } -static void nfs_client_mark_return_all_delegations(struct nfs_client *clp) -{ -	nfs_client_mark_return_all_delegation_types(clp, FMODE_READ|FMODE_WRITE); -} -  static void nfs_delegation_run_state_manager(struct nfs_client *clp)  {  	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))  		nfs4_schedule_state_manager(clp);  } +void nfs_remove_bad_delegation(struct inode *inode) +{ +	struct nfs_delegation *delegation; + +	delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); +	if (delegation) { +		nfs_inode_find_state_and_recover(inode, &delegation->stateid); +		nfs_free_delegation(delegation); +	} +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); +  /**   * nfs_expire_all_delegation_types   * @clp: client to process @@ -488,18 +496,6 @@ void nfs_expire_all_delegations(struct nfs_client *clp)  	nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);  } -/** - * nfs_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN - * @clp: client to process - * - */ -void nfs_handle_cb_pathdown(struct nfs_client *clp) -{ -	if (clp == NULL) -		return; -	nfs_client_mark_return_all_delegations(clp); -} -  static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)  {  	struct nfs_delegation *delegation; @@ -531,7 +527,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)  /**   * nfs_async_inode_return_delegation - asynchronously return a delegation   * @inode: inode to process - * @stateid: state ID information from CB_RECALL arguments + * @stateid: state ID 
information   *   * Returns zero on success, or a negative errno value.   */ @@ -545,7 +541,7 @@ int nfs_async_inode_return_delegation(struct inode *inode,  	rcu_read_lock();  	delegation = rcu_dereference(NFS_I(inode)->delegation); -	if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { +	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) {  		rcu_read_unlock();  		return -ENOENT;  	} @@ -684,21 +680,25 @@ int nfs_delegations_present(struct nfs_client *clp)   * nfs4_copy_delegation_stateid - Copy inode's state ID information   * @dst: stateid data structure to fill in   * @inode: inode to check + * @flags: delegation type requirement   * - * Returns one and fills in "dst->data" * if inode had a delegation, - * otherwise zero is returned. + * Returns "true" and fills in "dst->data" * if inode had a delegation, + * otherwise "false" is returned.   */ -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, +		fmode_t flags)  {  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_delegation *delegation; -	int ret = 0; +	bool ret; +	flags &= FMODE_READ|FMODE_WRITE;  	rcu_read_lock();  	delegation = rcu_dereference(nfsi->delegation); -	if (delegation != NULL) { -		memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); -		ret = 1; +	ret = (delegation != NULL && (delegation->type & flags) == flags); +	if (ret) { +		nfs4_stateid_copy(dst, &delegation->stateid); +		nfs_mark_delegation_referenced(delegation);  	}  	rcu_read_unlock();  	return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index d9322e490c5..cd6a7a8dada 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -42,9 +42,9 @@ void nfs_super_return_all_delegations(struct super_block *sb);  void nfs_expire_all_delegations(struct nfs_client *clp);  void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);  void nfs_expire_unreferenced_delegations(struct 
nfs_client *clp); -void nfs_handle_cb_pathdown(struct nfs_client *clp);  int nfs_client_return_marked_delegations(struct nfs_client *clp);  int nfs_delegations_present(struct nfs_client *clp); +void nfs_remove_bad_delegation(struct inode *inode);  void nfs_delegation_mark_reclaim(struct nfs_client *clp);  void nfs_delegation_reap_unclaimed(struct nfs_client *clp); @@ -53,7 +53,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);  int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);  int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);  int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); -int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); +bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);  void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);  int nfs_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fd9a872fada..4aaf0316d76 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -207,7 +207,7 @@ struct nfs_cache_array_entry {  };  struct nfs_cache_array { -	unsigned int size; +	int size;  	int eof_index;  	u64 last_cookie;  	struct nfs_cache_array_entry array[0]; @@ -260,10 +260,10 @@ void nfs_readdir_clear_array(struct page *page)  	struct nfs_cache_array *array;  	int i; -	array = kmap_atomic(page, KM_USER0); +	array = kmap_atomic(page);  	for (i = 0; i < array->size; i++)  		kfree(array->array[i].string.name); -	kunmap_atomic(array, KM_USER0); +	kunmap_atomic(array);  }  /* @@ -1429,6 +1429,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry  	}  	open_flags = nd->intent.open.flags; +	attr.ia_valid = 0;  	ctx = create_nfs_open_context(dentry, open_flags);  	res = ERR_CAST(ctx); @@ -1437,11 +1438,14 @@ static struct dentry 
*nfs_atomic_lookup(struct inode *dir, struct dentry *dentry  	if (nd->flags & LOOKUP_CREATE) {  		attr.ia_mode = nd->intent.open.create_mode; -		attr.ia_valid = ATTR_MODE; +		attr.ia_valid |= ATTR_MODE;  		attr.ia_mode &= ~current_umask(); -	} else { +	} else  		open_flags &= ~(O_EXCL | O_CREAT); -		attr.ia_valid = 0; + +	if (open_flags & O_TRUNC) { +		attr.ia_valid |= ATTR_SIZE; +		attr.ia_size = 0;  	}  	/* Open the file on the server */ @@ -1495,6 +1499,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)  	struct inode *inode;  	struct inode *dir;  	struct nfs_open_context *ctx; +	struct iattr attr;  	int openflags, ret = 0;  	if (nd->flags & LOOKUP_RCU) @@ -1523,19 +1528,27 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)  	/* We cannot do exclusive creation on a positive dentry */  	if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))  		goto no_open_dput; -	/* We can't create new files, or truncate existing ones here */ -	openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); +	/* We can't create new files here */ +	openflags &= ~(O_CREAT|O_EXCL);  	ctx = create_nfs_open_context(dentry, openflags);  	ret = PTR_ERR(ctx);  	if (IS_ERR(ctx))  		goto out; + +	attr.ia_valid = 0; +	if (openflags & O_TRUNC) { +		attr.ia_valid |= ATTR_SIZE; +		attr.ia_size = 0; +		nfs_wb_all(inode); +	} +  	/*  	 * Note: we're not holding inode->i_mutex and so may be racing with  	 * operations that change the directory. We therefore save the  	 * change attribute *before* we do the RPC call.  	 
*/ -	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, NULL); +	inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);  	if (IS_ERR(inode)) {  		ret = PTR_ERR(inode);  		switch (ret) { @@ -1870,11 +1883,11 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym  	if (!page)  		return -ENOMEM; -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memcpy(kaddr, symname, pathlen);  	if (pathlen < PAGE_SIZE)  		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);  	if (error != 0) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1940f1a56a5..9c7f66ac6cc 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -265,9 +265,7 @@ static void nfs_direct_read_release(void *calldata)  }  static const struct rpc_call_ops nfs_read_direct_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_direct_read_result,  	.rpc_release = nfs_direct_read_release,  }; @@ -554,9 +552,7 @@ static void nfs_direct_commit_release(void *calldata)  }  static const struct rpc_call_ops nfs_commit_direct_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_direct_commit_result,  	.rpc_release = nfs_direct_commit_release,  }; @@ -696,9 +692,7 @@ out_unlock:  }  static const struct rpc_call_ops nfs_write_direct_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_direct_write_result,  	.rpc_release = nfs_direct_write_release,  }; diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index a6e711ad130..b3924b8a600 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -10,8 +10,9 @@  #include <linux/sunrpc/clnt.h>  #include <linux/dns_resolver.h> +#include "dns_resolve.h" 
-ssize_t nfs_dns_resolve_name(char *name, size_t namelen, +ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,  		struct sockaddr *sa, size_t salen)  {  	ssize_t ret; @@ -20,7 +21,7 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,  	ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);  	if (ip_len > 0) -		ret = rpc_pton(ip_addr, ip_len, sa, salen); +		ret = rpc_pton(net, ip_addr, ip_len, sa, salen);  	else  		ret = -ESRCH;  	kfree(ip_addr); @@ -40,15 +41,15 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,  #include <linux/sunrpc/clnt.h>  #include <linux/sunrpc/cache.h>  #include <linux/sunrpc/svcauth.h> +#include <linux/sunrpc/rpc_pipe_fs.h>  #include "dns_resolve.h"  #include "cache_lib.h" +#include "netns.h"  #define NFS_DNS_HASHBITS 4  #define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) -static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; -  struct nfs_dns_ent {  	struct cache_head h; @@ -224,7 +225,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)  	len = qword_get(&buf, buf1, sizeof(buf1));  	if (len <= 0)  		goto out; -	key.addrlen = rpc_pton(buf1, len, +	key.addrlen = rpc_pton(cd->net, buf1, len,  			(struct sockaddr *)&key.addr,  			sizeof(key.addr)); @@ -259,21 +260,6 @@ out:  	return ret;  } -static struct cache_detail nfs_dns_resolve = { -	.owner = THIS_MODULE, -	.hash_size = NFS_DNS_HASHTBL_SIZE, -	.hash_table = nfs_dns_table, -	.name = "dns_resolve", -	.cache_put = nfs_dns_ent_put, -	.cache_upcall = nfs_dns_upcall, -	.cache_parse = nfs_dns_parse, -	.cache_show = nfs_dns_show, -	.match = nfs_dns_match, -	.init = nfs_dns_ent_init, -	.update = nfs_dns_ent_update, -	.alloc = nfs_dns_ent_alloc, -}; -  static int do_cache_lookup(struct cache_detail *cd,  		struct nfs_dns_ent *key,  		struct nfs_dns_ent **item, @@ -336,8 +322,8 @@ out:  	return ret;  } -ssize_t nfs_dns_resolve_name(char *name, size_t namelen, -		struct sockaddr *sa, size_t salen) +ssize_t 
nfs_dns_resolve_name(struct net *net, char *name, +		size_t namelen, struct sockaddr *sa, size_t salen)  {  	struct nfs_dns_ent key = {  		.hostname = name, @@ -345,28 +331,118 @@ ssize_t nfs_dns_resolve_name(char *name, size_t namelen,  	};  	struct nfs_dns_ent *item = NULL;  	ssize_t ret; +	struct nfs_net *nn = net_generic(net, nfs_net_id); -	ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); +	ret = do_cache_lookup_wait(nn->nfs_dns_resolve, &key, &item);  	if (ret == 0) {  		if (salen >= item->addrlen) {  			memcpy(sa, &item->addr, item->addrlen);  			ret = item->addrlen;  		} else  			ret = -EOVERFLOW; -		cache_put(&item->h, &nfs_dns_resolve); +		cache_put(&item->h, nn->nfs_dns_resolve);  	} else if (ret == -ENOENT)  		ret = -ESRCH;  	return ret;  } +int nfs_dns_resolver_cache_init(struct net *net) +{ +	int err = -ENOMEM; +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct cache_detail *cd; +	struct cache_head **tbl; + +	cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL); +	if (cd == NULL) +		goto err_cd; + +	tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *), +			GFP_KERNEL); +	if (tbl == NULL) +		goto err_tbl; + +	cd->owner = THIS_MODULE, +	cd->hash_size = NFS_DNS_HASHTBL_SIZE, +	cd->hash_table = tbl, +	cd->name = "dns_resolve", +	cd->cache_put = nfs_dns_ent_put, +	cd->cache_upcall = nfs_dns_upcall, +	cd->cache_parse = nfs_dns_parse, +	cd->cache_show = nfs_dns_show, +	cd->match = nfs_dns_match, +	cd->init = nfs_dns_ent_init, +	cd->update = nfs_dns_ent_update, +	cd->alloc = nfs_dns_ent_alloc, + +	nfs_cache_init(cd); +	err = nfs_cache_register_net(net, cd); +	if (err) +		goto err_reg; +	nn->nfs_dns_resolve = cd; +	return 0; + +err_reg: +	nfs_cache_destroy(cd); +	kfree(cd->hash_table); +err_tbl: +	kfree(cd); +err_cd: +	return err; +} + +void nfs_dns_resolver_cache_destroy(struct net *net) +{ +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct cache_detail *cd = nn->nfs_dns_resolve; + +	nfs_cache_unregister_net(net, 
cd); +	nfs_cache_destroy(cd); +	kfree(cd->hash_table); +	kfree(cd); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, +			   void *ptr) +{ +	struct super_block *sb = ptr; +	struct net *net = sb->s_fs_info; +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct cache_detail *cd = nn->nfs_dns_resolve; +	int ret = 0; + +	if (cd == NULL) +		return 0; + +	if (!try_module_get(THIS_MODULE)) +		return 0; + +	switch (event) { +	case RPC_PIPEFS_MOUNT: +		ret = nfs_cache_register_sb(sb, cd); +		break; +	case RPC_PIPEFS_UMOUNT: +		nfs_cache_unregister_sb(sb, cd); +		break; +	default: +		ret = -ENOTSUPP; +		break; +	} +	module_put(THIS_MODULE); +	return ret; +} + +static struct notifier_block nfs_dns_resolver_block = { +	.notifier_call	= rpc_pipefs_event, +}; +  int nfs_dns_resolver_init(void)  { -	return nfs_cache_register(&nfs_dns_resolve); +	return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);  }  void nfs_dns_resolver_destroy(void)  { -	nfs_cache_unregister(&nfs_dns_resolve); +	rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);  } -  #endif diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h index 199bb5543a9..2e4f596d292 100644 --- a/fs/nfs/dns_resolve.h +++ b/fs/nfs/dns_resolve.h @@ -15,12 +15,22 @@ static inline int nfs_dns_resolver_init(void)  static inline void nfs_dns_resolver_destroy(void)  {} + +static inline int nfs_dns_resolver_cache_init(struct net *net) +{ +	return 0; +} + +static inline void nfs_dns_resolver_cache_destroy(struct net *net) +{}  #else  extern int nfs_dns_resolver_init(void);  extern void nfs_dns_resolver_destroy(void); +extern int nfs_dns_resolver_cache_init(struct net *net); +extern void nfs_dns_resolver_cache_destroy(struct net *net);  #endif -extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, -		struct sockaddr *sa, size_t salen); +extern ssize_t nfs_dns_resolve_name(struct net *net, char *name, +		size_t namelen,	struct sockaddr *sa, size_t salen);  #endif diff --git 
a/fs/nfs/file.c b/fs/nfs/file.c index c43a452f7da..4fdaaa63cf1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -530,6 +530,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	if (mapping != dentry->d_inode->i_mapping)  		goto out_unlock; +	wait_on_page_writeback(page); +  	pagelen = nfs_page_length(page);  	if (pagelen == 0)  		goto out_unlock; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 419119c371b..ae65c16b367 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -327,7 +327,7 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)  {  	struct nfs_inode *nfsi = NFS_I(inode);  	struct nfs_server *nfss = NFS_SERVER(inode); -	struct fscache_cookie *old = nfsi->fscache; +	NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);  	nfs_fscache_inode_lock(inode);  	if (nfsi->fscache) { diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index dcb61548887..801d6d83078 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -49,11 +49,9 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i  {  	/* The mntroot acts as the dummy root dentry for this superblock */  	if (sb->s_root == NULL) { -		sb->s_root = d_alloc_root(inode); -		if (sb->s_root == NULL) { -			iput(inode); +		sb->s_root = d_make_root(inode); +		if (sb->s_root == NULL)  			return -ENOMEM; -		}  		ihold(inode);  		/*  		 * Ensure that this dentry is invisible to d_find_alias(). diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 2c05f1991e1..b7f348bb618 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -34,11 +34,29 @@   *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   
*/  #include <linux/types.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/slab.h> +#include <linux/parser.h> +#include <linux/fs.h>  #include <linux/nfs_idmap.h> +#include <net/net_namespace.h> +#include <linux/sunrpc/rpc_pipe_fs.h>  #include <linux/nfs_fs.h> +#include <linux/nfs_fs_sb.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <linux/module.h> + +#include "internal.h" +#include "netns.h" + +#define NFS_UINT_MAXLEN 11 + +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600; +static const struct cred *id_resolver_cache; +static struct key_type key_type_id_resolver_legacy; +  /**   * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields @@ -142,24 +160,7 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)  	return snprintf(buf, buflen, "%u", id);  } -#ifdef CONFIG_NFS_USE_NEW_IDMAPPER - -#include <linux/cred.h> -#include <linux/sunrpc/sched.h> -#include <linux/nfs4.h> -#include <linux/nfs_fs_sb.h> -#include <linux/keyctl.h> -#include <linux/key-type.h> -#include <linux/rcupdate.h> -#include <linux/err.h> - -#include <keys/user-type.h> - -#define NFS_UINT_MAXLEN 11 - -const struct cred *id_resolver_cache; - -struct key_type key_type_id_resolver = { +static struct key_type key_type_id_resolver = {  	.name		= "id_resolver",  	.instantiate	= user_instantiate,  	.match		= user_match, @@ -169,13 +170,14 @@ struct key_type key_type_id_resolver = {  	.read		= user_read,  }; -int nfs_idmap_init(void) +static int nfs_idmap_init_keyring(void)  {  	struct cred *cred;  	struct key *keyring;  	int ret = 0; -	printk(KERN_NOTICE "Registering the %s key type\n", key_type_id_resolver.name); +	printk(KERN_NOTICE "NFS: Registering the %s key type\n", +		key_type_id_resolver.name);  	cred = prepare_kernel_cred(NULL);  	if (!cred) @@ -198,6 +200,7 @@ int nfs_idmap_init(void)  	if (ret < 0)  		goto 
failed_put_key; +	set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags);  	cred->thread_keyring = keyring;  	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;  	id_resolver_cache = cred; @@ -210,7 +213,7 @@ failed_put_cred:  	return ret;  } -void nfs_idmap_quit(void) +static void nfs_idmap_quit_keyring(void)  {  	key_revoke(id_resolver_cache->thread_keyring);  	unregister_key_type(&key_type_id_resolver); @@ -245,8 +248,10 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,  	return desclen;  } -static ssize_t nfs_idmap_request_key(const char *name, size_t namelen, -		const char *type, void *data, size_t data_size) +static ssize_t nfs_idmap_request_key(struct key_type *key_type, +				     const char *name, size_t namelen, +				     const char *type, void *data, +				     size_t data_size, struct idmap *idmap)  {  	const struct cred *saved_cred;  	struct key *rkey; @@ -259,8 +264,12 @@ static ssize_t nfs_idmap_request_key(const char *name, size_t namelen,  		goto out;  	saved_cred = override_creds(id_resolver_cache); -	rkey = request_key(&key_type_id_resolver, desc, ""); +	if (idmap) +		rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap); +	else +		rkey = request_key(&key_type_id_resolver, desc, "");  	revert_creds(saved_cred); +  	kfree(desc);  	if (IS_ERR(rkey)) {  		ret = PTR_ERR(rkey); @@ -293,31 +302,46 @@ out:  	return ret;  } +static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, +				 const char *type, void *data, +				 size_t data_size, struct idmap *idmap) +{ +	ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver, +					    name, namelen, type, data, +					    data_size, NULL); +	if (ret < 0) { +		ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, +					    name, namelen, type, data, +					    data_size, idmap); +	} +	return ret; +}  /* ID -> Name */ -static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen) +static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, 
char *buf, +				     size_t buflen, struct idmap *idmap)  {  	char id_str[NFS_UINT_MAXLEN];  	int id_len;  	ssize_t ret;  	id_len = snprintf(id_str, sizeof(id_str), "%u", id); -	ret = nfs_idmap_request_key(id_str, id_len, type, buf, buflen); +	ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap);  	if (ret < 0)  		return -EINVAL;  	return ret;  }  /* Name -> ID */ -static int nfs_idmap_lookup_id(const char *name, size_t namelen, -				const char *type, __u32 *id) +static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, +			       __u32 *id, struct idmap *idmap)  {  	char id_str[NFS_UINT_MAXLEN];  	long id_long;  	ssize_t data_size;  	int ret = 0; -	data_size = nfs_idmap_request_key(name, namelen, type, id_str, NFS_UINT_MAXLEN); +	data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap);  	if (data_size <= 0) {  		ret = -EINVAL;  	} else { @@ -327,114 +351,103 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,  	return ret;  } -int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) -{ -	if (nfs_map_string_to_numeric(name, namelen, uid)) -		return 0; -	return nfs_idmap_lookup_id(name, namelen, "uid", uid); -} - -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) -{ -	if (nfs_map_string_to_numeric(name, namelen, gid)) -		return 0; -	return nfs_idmap_lookup_id(name, namelen, "gid", gid); -} - -int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) -{ -	int ret = -EINVAL; - -	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) -		ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); -	if (ret < 0) -		ret = nfs_map_numeric_to_string(uid, buf, buflen); -	return ret; -} -int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) -{ -	int ret = -EINVAL; +/* idmap classic begins here */ +module_param(nfs_idmap_cache_timeout, 
int, 0644); -	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) -		ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); -	if (ret < 0) -		ret = nfs_map_numeric_to_string(gid, buf, buflen); -	return ret; -} - -#else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ - -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/init.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/sched.h> -#include <linux/sunrpc/clnt.h> -#include <linux/workqueue.h> -#include <linux/sunrpc/rpc_pipe_fs.h> - -#include <linux/nfs_fs.h> - -#include "nfs4_fs.h" - -#define IDMAP_HASH_SZ          128 - -/* Default cache timeout is 10 minutes */ -unsigned int nfs_idmap_cache_timeout = 600 * HZ; - -static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) -{ -	char *endp; -	int num = simple_strtol(val, &endp, 0); -	int jif = num * HZ; -	if (endp == val || *endp || num < 0 || jif < num) -		return -EINVAL; -	*((int *)kp->arg) = jif; -	return 0; -} - -module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, -		 &nfs_idmap_cache_timeout, 0644); - -struct idmap_hashent { -	unsigned long		ih_expires; -	__u32			ih_id; -	size_t			ih_namelen; -	char			ih_name[IDMAP_NAMESZ]; +struct idmap { +	struct rpc_pipe		*idmap_pipe; +	struct key_construction	*idmap_key_cons;  }; -struct idmap_hashtable { -	__u8			h_type; -	struct idmap_hashent	h_entries[IDMAP_HASH_SZ]; +enum { +	Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err  }; -struct idmap { -	struct dentry		*idmap_dentry; -	wait_queue_head_t	idmap_wq; -	struct idmap_msg	idmap_im; -	struct mutex		idmap_lock;	/* Serializes upcalls */ -	struct mutex		idmap_im_lock;	/* Protects the hashtable */ -	struct idmap_hashtable	idmap_user_hash; -	struct idmap_hashtable	idmap_group_hash; +static const match_table_t nfs_idmap_tokens = { +	{ Opt_find_uid, "uid:%s" }, +	{ Opt_find_gid, "gid:%s" }, +	{ Opt_find_user, "user:%s" }, +	{ Opt_find_group, "group:%s" }, +	{ Opt_find_err, NULL } 
 }; +static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);  static ssize_t idmap_pipe_downcall(struct file *, const char __user *,  				   size_t);  static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); -static unsigned int fnvhash32(const void *, size_t); -  static const struct rpc_pipe_ops idmap_upcall_ops = {  	.upcall		= rpc_pipe_generic_upcall,  	.downcall	= idmap_pipe_downcall,  	.destroy_msg	= idmap_pipe_destroy_msg,  }; +static struct key_type key_type_id_resolver_legacy = { +	.name		= "id_resolver", +	.instantiate	= user_instantiate, +	.match		= user_match, +	.revoke		= user_revoke, +	.destroy	= user_destroy, +	.describe	= user_describe, +	.read		= user_read, +	.request_key	= nfs_idmap_legacy_upcall, +}; + +static void __nfs_idmap_unregister(struct rpc_pipe *pipe) +{ +	if (pipe->dentry) +		rpc_unlink(pipe->dentry); +} + +static int __nfs_idmap_register(struct dentry *dir, +				     struct idmap *idmap, +				     struct rpc_pipe *pipe) +{ +	struct dentry *dentry; + +	dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe); +	if (IS_ERR(dentry)) +		return PTR_ERR(dentry); +	pipe->dentry = dentry; +	return 0; +} + +static void nfs_idmap_unregister(struct nfs_client *clp, +				      struct rpc_pipe *pipe) +{ +	struct net *net = clp->net; +	struct super_block *pipefs_sb; + +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		__nfs_idmap_unregister(pipe); +		rpc_put_sb_net(net); +	} +} + +static int nfs_idmap_register(struct nfs_client *clp, +				   struct idmap *idmap, +				   struct rpc_pipe *pipe) +{ +	struct net *net = clp->net; +	struct super_block *pipefs_sb; +	int err = 0; + +	pipefs_sb = rpc_get_sb_net(net); +	if (pipefs_sb) { +		if (clp->cl_rpcclient->cl_dentry) +			err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, +						   idmap, pipe); +		rpc_put_sb_net(net); +	} +	return err; +} +  int  nfs_idmap_new(struct nfs_client *clp)  {  	struct idmap *idmap; +	struct rpc_pipe *pipe;  	int error;  	
BUG_ON(clp->cl_idmap != NULL); @@ -443,19 +456,19 @@ nfs_idmap_new(struct nfs_client *clp)  	if (idmap == NULL)  		return -ENOMEM; -	idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, -			"idmap", idmap, &idmap_upcall_ops, 0); -	if (IS_ERR(idmap->idmap_dentry)) { -		error = PTR_ERR(idmap->idmap_dentry); +	pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); +	if (IS_ERR(pipe)) { +		error = PTR_ERR(pipe);  		kfree(idmap);  		return error;  	} - -	mutex_init(&idmap->idmap_lock); -	mutex_init(&idmap->idmap_im_lock); -	init_waitqueue_head(&idmap->idmap_wq); -	idmap->idmap_user_hash.h_type = IDMAP_TYPE_USER; -	idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; +	error = nfs_idmap_register(clp, idmap, pipe); +	if (error) { +		rpc_destroy_pipe_data(pipe); +		kfree(idmap); +		return error; +	} +	idmap->idmap_pipe = pipe;  	clp->cl_idmap = idmap;  	return 0; @@ -468,211 +481,220 @@ nfs_idmap_delete(struct nfs_client *clp)  	if (!idmap)  		return; -	rpc_unlink(idmap->idmap_dentry); +	nfs_idmap_unregister(clp, idmap->idmap_pipe); +	rpc_destroy_pipe_data(idmap->idmap_pipe);  	clp->cl_idmap = NULL;  	kfree(idmap);  } -/* - * Helper routines for manipulating the hashtable - */ -static inline struct idmap_hashent * -idmap_name_hash(struct idmap_hashtable* h, const char *name, size_t len) +static int __rpc_pipefs_event(struct nfs_client *clp, unsigned long event, +			      struct super_block *sb)  { -	return &h->h_entries[fnvhash32(name, len) % IDMAP_HASH_SZ]; -} +	int err = 0; -static struct idmap_hashent * -idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) -{ -	struct idmap_hashent *he = idmap_name_hash(h, name, len); +	switch (event) { +	case RPC_PIPEFS_MOUNT: +		BUG_ON(clp->cl_rpcclient->cl_dentry == NULL); +		err = __nfs_idmap_register(clp->cl_rpcclient->cl_dentry, +						clp->cl_idmap, +						clp->cl_idmap->idmap_pipe); +		break; +	case RPC_PIPEFS_UMOUNT: +		if (clp->cl_idmap->idmap_pipe) { +			struct dentry *parent; -	if (he->ih_namelen != 
len || memcmp(he->ih_name, name, len) != 0) -		return NULL; -	if (time_after(jiffies, he->ih_expires)) -		return NULL; -	return he; +			parent = clp->cl_idmap->idmap_pipe->dentry->d_parent; +			__nfs_idmap_unregister(clp->cl_idmap->idmap_pipe); +			/* +			 * Note: This is a dirty hack. SUNRPC hook has been +			 * called already but simple_rmdir() call for the +			 * directory returned with error because of idmap pipe +			 * inside. Thus now we have to remove this directory +			 * here. +			 */ +			if (rpc_rmdir(parent)) +				printk(KERN_ERR "NFS: %s: failed to remove " +					"clnt dir!\n", __func__); +		} +		break; +	default: +		printk(KERN_ERR "NFS: %s: unknown event: %ld\n", __func__, +			event); +		return -ENOTSUPP; +	} +	return err;  } -static inline struct idmap_hashent * -idmap_id_hash(struct idmap_hashtable* h, __u32 id) +static struct nfs_client *nfs_get_client_for_event(struct net *net, int event)  { -	return &h->h_entries[fnvhash32(&id, sizeof(id)) % IDMAP_HASH_SZ]; -} +	struct nfs_net *nn = net_generic(net, nfs_net_id); +	struct dentry *cl_dentry; +	struct nfs_client *clp; -static struct idmap_hashent * -idmap_lookup_id(struct idmap_hashtable *h, __u32 id) -{ -	struct idmap_hashent *he = idmap_id_hash(h, id); -	if (he->ih_id != id || he->ih_namelen == 0) -		return NULL; -	if (time_after(jiffies, he->ih_expires)) -		return NULL; -	return he; +	spin_lock(&nn->nfs_client_lock); +	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { +		if (clp->rpc_ops != &nfs_v4_clientops) +			continue; +		cl_dentry = clp->cl_idmap->idmap_pipe->dentry; +		if (((event == RPC_PIPEFS_MOUNT) && cl_dentry) || +		    ((event == RPC_PIPEFS_UMOUNT) && !cl_dentry)) +			continue; +		atomic_inc(&clp->cl_count); +		spin_unlock(&nn->nfs_client_lock); +		return clp; +	} +	spin_unlock(&nn->nfs_client_lock); +	return NULL;  } -/* - * Routines for allocating new entries in the hashtable. - * For now, we just have 1 entry per bucket, so it's all - * pretty trivial. 
- */ -static inline struct idmap_hashent * -idmap_alloc_name(struct idmap_hashtable *h, char *name, size_t len) +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, +			    void *ptr)  { -	return idmap_name_hash(h, name, len); +	struct super_block *sb = ptr; +	struct nfs_client *clp; +	int error = 0; + +	while ((clp = nfs_get_client_for_event(sb->s_fs_info, event))) { +		error = __rpc_pipefs_event(clp, event, sb); +		nfs_put_client(clp); +		if (error) +			break; +	} +	return error;  } -static inline struct idmap_hashent * -idmap_alloc_id(struct idmap_hashtable *h, __u32 id) +#define PIPEFS_NFS_PRIO		1 + +static struct notifier_block nfs_idmap_block = { +	.notifier_call	= rpc_pipefs_event, +	.priority	= SUNRPC_PIPEFS_NFS_PRIO, +}; + +int nfs_idmap_init(void)  { -	return idmap_id_hash(h, id); +	int ret; +	ret = nfs_idmap_init_keyring(); +	if (ret != 0) +		goto out; +	ret = rpc_pipefs_notifier_register(&nfs_idmap_block); +	if (ret != 0) +		nfs_idmap_quit_keyring(); +out: +	return ret;  } -static void -idmap_update_entry(struct idmap_hashent *he, const char *name, -		size_t namelen, __u32 id) +void nfs_idmap_quit(void)  { -	he->ih_id = id; -	memcpy(he->ih_name, name, namelen); -	he->ih_name[namelen] = '\0'; -	he->ih_namelen = namelen; -	he->ih_expires = jiffies + nfs_idmap_cache_timeout; +	rpc_pipefs_notifier_unregister(&nfs_idmap_block); +	nfs_idmap_quit_keyring();  } -/* - * Name -> ID - */ -static int -nfs_idmap_id(struct idmap *idmap, struct idmap_hashtable *h, -		const char *name, size_t namelen, __u32 *id) +static int nfs_idmap_prepare_message(char *desc, struct idmap_msg *im, +				     struct rpc_pipe_msg *msg)  { -	struct rpc_pipe_msg msg; -	struct idmap_msg *im; -	struct idmap_hashent *he; -	DECLARE_WAITQUEUE(wq, current); -	int ret = -EIO; - -	im = &idmap->idmap_im; +	substring_t substr; +	int token, ret; -	/* -	 * String sanity checks -	 * Note that the userland daemon expects NUL terminated strings -	 */ -	for (;;) { -		if (namelen 
== 0) -			return -EINVAL; -		if (name[namelen-1] != '\0') -			break; -		namelen--; -	} -	if (namelen >= IDMAP_NAMESZ) -		return -EINVAL; - -	mutex_lock(&idmap->idmap_lock); -	mutex_lock(&idmap->idmap_im_lock); - -	he = idmap_lookup_name(h, name, namelen); -	if (he != NULL) { -		*id = he->ih_id; -		ret = 0; -		goto out; -	} +	memset(im,  0, sizeof(*im)); +	memset(msg, 0, sizeof(*msg)); -	memset(im, 0, sizeof(*im)); -	memcpy(im->im_name, name, namelen); +	im->im_type = IDMAP_TYPE_GROUP; +	token = match_token(desc, nfs_idmap_tokens, &substr); -	im->im_type = h->h_type; -	im->im_conv = IDMAP_CONV_NAMETOID; +	switch (token) { +	case Opt_find_uid: +		im->im_type = IDMAP_TYPE_USER; +	case Opt_find_gid: +		im->im_conv = IDMAP_CONV_NAMETOID; +		ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); +		break; -	memset(&msg, 0, sizeof(msg)); -	msg.data = im; -	msg.len = sizeof(*im); +	case Opt_find_user: +		im->im_type = IDMAP_TYPE_USER; +	case Opt_find_group: +		im->im_conv = IDMAP_CONV_IDTONAME; +		ret = match_int(&substr, &im->im_id); +		break; -	add_wait_queue(&idmap->idmap_wq, &wq); -	if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { -		remove_wait_queue(&idmap->idmap_wq, &wq); +	default: +		ret = -EINVAL;  		goto out;  	} -	set_current_state(TASK_UNINTERRUPTIBLE); -	mutex_unlock(&idmap->idmap_im_lock); -	schedule(); -	__set_current_state(TASK_RUNNING); -	remove_wait_queue(&idmap->idmap_wq, &wq); -	mutex_lock(&idmap->idmap_im_lock); - -	if (im->im_status & IDMAP_STATUS_SUCCESS) { -		*id = im->im_id; -		ret = 0; -	} +	msg->data = im; +	msg->len  = sizeof(struct idmap_msg); - out: -	memset(im, 0, sizeof(*im)); -	mutex_unlock(&idmap->idmap_im_lock); -	mutex_unlock(&idmap->idmap_lock); +out:  	return ret;  } -/* - * ID -> Name - */ -static int -nfs_idmap_name(struct idmap *idmap, struct idmap_hashtable *h, -		__u32 id, char *name) +static int nfs_idmap_legacy_upcall(struct key_construction *cons, +				   const char *op, +				   void *aux)  { -	struct 
rpc_pipe_msg msg; +	struct rpc_pipe_msg *msg;  	struct idmap_msg *im; -	struct idmap_hashent *he; -	DECLARE_WAITQUEUE(wq, current); -	int ret = -EIO; -	unsigned int len; - -	im = &idmap->idmap_im; +	struct idmap *idmap = (struct idmap *)aux; +	struct key *key = cons->key; +	int ret; -	mutex_lock(&idmap->idmap_lock); -	mutex_lock(&idmap->idmap_im_lock); +	/* msg and im are freed in idmap_pipe_destroy_msg */ +	msg = kmalloc(sizeof(*msg), GFP_KERNEL); +	if (IS_ERR(msg)) { +		ret = PTR_ERR(msg); +		goto out0; +	} -	he = idmap_lookup_id(h, id); -	if (he) { -		memcpy(name, he->ih_name, he->ih_namelen); -		ret = he->ih_namelen; -		goto out; +	im = kmalloc(sizeof(*im), GFP_KERNEL); +	if (IS_ERR(im)) { +		ret = PTR_ERR(im); +		goto out1;  	} -	memset(im, 0, sizeof(*im)); -	im->im_type = h->h_type; -	im->im_conv = IDMAP_CONV_IDTONAME; -	im->im_id = id; +	ret = nfs_idmap_prepare_message(key->description, im, msg); +	if (ret < 0) +		goto out2; -	memset(&msg, 0, sizeof(msg)); -	msg.data = im; -	msg.len = sizeof(*im); +	idmap->idmap_key_cons = cons; -	add_wait_queue(&idmap->idmap_wq, &wq); +	ret = rpc_queue_upcall(idmap->idmap_pipe, msg); +	if (ret < 0) +		goto out2; -	if (rpc_queue_upcall(idmap->idmap_dentry->d_inode, &msg) < 0) { -		remove_wait_queue(&idmap->idmap_wq, &wq); -		goto out; -	} +	return ret; + +out2: +	kfree(im); +out1: +	kfree(msg); +out0: +	key_revoke(cons->key); +	key_revoke(cons->authkey); +	return ret; +} -	set_current_state(TASK_UNINTERRUPTIBLE); -	mutex_unlock(&idmap->idmap_im_lock); -	schedule(); -	__set_current_state(TASK_RUNNING); -	remove_wait_queue(&idmap->idmap_wq, &wq); -	mutex_lock(&idmap->idmap_im_lock); +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +{ +	return key_instantiate_and_link(key, data, strlen(data) + 1, +					id_resolver_cache->thread_keyring, +					authkey); +} -	if (im->im_status & IDMAP_STATUS_SUCCESS) { -		if ((len = strnlen(im->im_name, IDMAP_NAMESZ)) == 0) -			goto out; -		memcpy(name, 
im->im_name, len); -		ret = len; +static int nfs_idmap_read_message(struct idmap_msg *im, struct key *key, struct key *authkey) +{ +	char id_str[NFS_UINT_MAXLEN]; +	int ret = -EINVAL; + +	switch (im->im_conv) { +	case IDMAP_CONV_NAMETOID: +		sprintf(id_str, "%d", im->im_id); +		ret = nfs_idmap_instantiate(key, authkey, id_str); +		break; +	case IDMAP_CONV_IDTONAME: +		ret = nfs_idmap_instantiate(key, authkey, im->im_name); +		break;  	} - out: -	memset(im, 0, sizeof(*im)); -	mutex_unlock(&idmap->idmap_im_lock); -	mutex_unlock(&idmap->idmap_lock);  	return ret;  } @@ -681,115 +703,51 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)  {  	struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);  	struct idmap *idmap = (struct idmap *)rpci->private; -	struct idmap_msg im_in, *im = &idmap->idmap_im; -	struct idmap_hashtable *h; -	struct idmap_hashent *he = NULL; +	struct key_construction *cons = idmap->idmap_key_cons; +	struct idmap_msg im;  	size_t namelen_in;  	int ret; -	if (mlen != sizeof(im_in)) -		return -ENOSPC; - -	if (copy_from_user(&im_in, src, mlen) != 0) -		return -EFAULT; - -	mutex_lock(&idmap->idmap_im_lock); - -	ret = mlen; -	im->im_status = im_in.im_status; -	/* If we got an error, terminate now, and wake up pending upcalls */ -	if (!(im_in.im_status & IDMAP_STATUS_SUCCESS)) { -		wake_up(&idmap->idmap_wq); +	if (mlen != sizeof(im)) { +		ret = -ENOSPC;  		goto out;  	} -	/* Sanity checking of strings */ -	ret = -EINVAL; -	namelen_in = strnlen(im_in.im_name, IDMAP_NAMESZ); -	if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) +	if (copy_from_user(&im, src, mlen) != 0) { +		ret = -EFAULT;  		goto out; +	} -	switch (im_in.im_type) { -		case IDMAP_TYPE_USER: -			h = &idmap->idmap_user_hash; -			break; -		case IDMAP_TYPE_GROUP: -			h = &idmap->idmap_group_hash; -			break; -		default: -			goto out; +	if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { +		ret = mlen; +		complete_request_key(idmap->idmap_key_cons, -ENOKEY); +		goto 
out_incomplete;  	} -	switch (im_in.im_conv) { -	case IDMAP_CONV_IDTONAME: -		/* Did we match the current upcall? */ -		if (im->im_conv == IDMAP_CONV_IDTONAME -				&& im->im_type == im_in.im_type -				&& im->im_id == im_in.im_id) { -			/* Yes: copy string, including the terminating '\0'  */ -			memcpy(im->im_name, im_in.im_name, namelen_in); -			im->im_name[namelen_in] = '\0'; -			wake_up(&idmap->idmap_wq); -		} -		he = idmap_alloc_id(h, im_in.im_id); -		break; -	case IDMAP_CONV_NAMETOID: -		/* Did we match the current upcall? */ -		if (im->im_conv == IDMAP_CONV_NAMETOID -				&& im->im_type == im_in.im_type -				&& strnlen(im->im_name, IDMAP_NAMESZ) == namelen_in -				&& memcmp(im->im_name, im_in.im_name, namelen_in) == 0) { -			im->im_id = im_in.im_id; -			wake_up(&idmap->idmap_wq); -		} -		he = idmap_alloc_name(h, im_in.im_name, namelen_in); -		break; -	default: +	namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); +	if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { +		ret = -EINVAL;  		goto out;  	} -	/* If the entry is valid, also copy it to the cache */ -	if (he != NULL) -		idmap_update_entry(he, im_in.im_name, namelen_in, im_in.im_id); -	ret = mlen; +	ret = nfs_idmap_read_message(&im, cons->key, cons->authkey); +	if (ret >= 0) { +		key_set_timeout(cons->key, nfs_idmap_cache_timeout); +		ret = mlen; +	} +  out: -	mutex_unlock(&idmap->idmap_im_lock); +	complete_request_key(idmap->idmap_key_cons, ret); +out_incomplete:  	return ret;  }  static void  idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)  { -	struct idmap_msg *im = msg->data; -	struct idmap *idmap = container_of(im, struct idmap, idmap_im);  - -	if (msg->errno >= 0) -		return; -	mutex_lock(&idmap->idmap_im_lock); -	im->im_status = IDMAP_STATUS_LOOKUPFAIL; -	wake_up(&idmap->idmap_wq); -	mutex_unlock(&idmap->idmap_im_lock); -} - -/*  - * Fowler/Noll/Vo hash - *    http://www.isthe.com/chongo/tech/comp/fnv/ - */ - -#define FNV_P_32 ((unsigned int)0x01000193) /* 16777619 */ -#define FNV_1_32 ((unsigned 
int)0x811c9dc5) /* 2166136261 */ - -static unsigned int fnvhash32(const void *buf, size_t buflen) -{ -	const unsigned char *p, *end = (const unsigned char *)buf + buflen; -	unsigned int hash = FNV_1_32; - -	for (p = buf; p < end; p++) { -		hash *= FNV_P_32; -		hash ^= (unsigned int)*p; -	} - -	return hash; +	/* Free memory allocated in nfs_idmap_legacy_upcall() */ +	kfree(msg->data); +	kfree(msg);  }  int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) @@ -798,16 +756,16 @@ int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_  	if (nfs_map_string_to_numeric(name, namelen, uid))  		return 0; -	return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); +	return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap);  } -int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)  {  	struct idmap *idmap = server->nfs_client->cl_idmap; -	if (nfs_map_string_to_numeric(name, namelen, uid)) +	if (nfs_map_string_to_numeric(name, namelen, gid))  		return 0; -	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); +	return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap);  }  int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) @@ -816,21 +774,19 @@ int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, s  	int ret = -EINVAL;  	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) -		ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); +		ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap);  	if (ret < 0)  		ret = nfs_map_numeric_to_string(uid, buf, buflen);  	return ret;  } -int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, 
char *buf, size_t buflen)  {  	struct idmap *idmap = server->nfs_client->cl_idmap;  	int ret = -EINVAL;  	if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) -		ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); +		ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap);  	if (ret < 0) -		ret = nfs_map_numeric_to_string(uid, buf, buflen); +		ret = nfs_map_numeric_to_string(gid, buf, buflen);  	return ret;  } - -#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f649fba8c38..7bb4d13c1cd 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -39,6 +39,7 @@  #include <linux/slab.h>  #include <linux/compat.h>  #include <linux/freezer.h> +#include <linux/crc32.h>  #include <asm/system.h>  #include <asm/uaccess.h> @@ -51,6 +52,7 @@  #include "fscache.h"  #include "dns_resolve.h"  #include "pnfs.h" +#include "netns.h"  #define NFSDBG_FACILITY		NFSDBG_VFS @@ -388,9 +390,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)  		unlock_new_inode(inode);  	} else  		nfs_refresh_inode(inode, fattr); -	dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", +	dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",  		inode->i_sb->s_id,  		(long long)NFS_FILEID(inode), +		nfs_display_fhandle_hash(fh),  		atomic_read(&inode->i_count));  out: @@ -401,7 +404,7 @@ out_no_inode:  	goto out;  } -#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE) +#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)  int  nfs_setattr(struct dentry *dentry, struct iattr *attr) @@ -423,7 +426,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)  	/* Optimization: if the end result is no change, don't RPC */  	attr->ia_valid &= NFS_VALID_ATTRS; -	if ((attr->ia_valid & ~ATTR_FILE) == 0) +	if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0)  		return 0;  	/* Write all dirty data */ 
@@ -1044,6 +1047,67 @@ struct nfs_fh *nfs_alloc_fhandle(void)  	return fh;  } +#ifdef NFS_DEBUG +/* + * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle + *                             in the same way that wireshark does + * + * @fh: file handle + * + * For debugging only. + */ +u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) +{ +	/* wireshark uses 32-bit AUTODIN crc and does a bitwise +	 * not on the result */ +	return ~crc32(0xFFFFFFFF, &fh->data[0], fh->size); +} + +/* + * _nfs_display_fhandle - display an NFS file handle on the console + * + * @fh: file handle to display + * @caption: display caption + * + * For debugging only. + */ +void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) +{ +	unsigned short i; + +	if (fh == NULL || fh->size == 0) { +		printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); +		return; +	} + +	printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", +	       caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); +	for (i = 0; i < fh->size; i += 16) { +		__be32 *pos = (__be32 *)&fh->data[i]; + +		switch ((fh->size - i - 1) >> 2) { +		case 0: +			printk(KERN_DEFAULT " %08x\n", +				be32_to_cpup(pos)); +			break; +		case 1: +			printk(KERN_DEFAULT " %08x %08x\n", +				be32_to_cpup(pos), be32_to_cpup(pos + 1)); +			break; +		case 2: +			printk(KERN_DEFAULT " %08x %08x %08x\n", +				be32_to_cpup(pos), be32_to_cpup(pos + 1), +				be32_to_cpup(pos + 2)); +			break; +		default: +			printk(KERN_DEFAULT " %08x %08x %08x %08x\n", +				be32_to_cpup(pos), be32_to_cpup(pos + 1), +				be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); +		} +	} +} +#endif +  /**   * nfs_inode_attrs_need_update - check if the inode attributes need updating   * @inode - pointer to inode @@ -1211,8 +1275,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	unsigned long now = jiffies;  	unsigned long save_cache_validity; -	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", +	dfprintk(VFS, 
"NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n",  			__func__, inode->i_sb->s_id, inode->i_ino, +			nfs_display_fhandle_hash(NFS_FH(inode)),  			atomic_read(&inode->i_count), fattr->valid);  	if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) @@ -1406,7 +1471,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)  	/*  	 * Big trouble! The inode has become a different object.  	 */ -	printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n", +	printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",  			__func__, inode->i_ino, inode->i_mode, fattr->mode);   out_err:  	/* @@ -1495,7 +1560,7 @@ static void init_once(void *foo)  	INIT_LIST_HEAD(&nfsi->open_files);  	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);  	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); -	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); +	INIT_LIST_HEAD(&nfsi->commit_list);  	nfsi->npages = 0;  	nfsi->ncommit = 0;  	atomic_set(&nfsi->silly_count, 1); @@ -1552,6 +1617,28 @@ static void nfsiod_stop(void)  	destroy_workqueue(wq);  } +int nfs_net_id; +EXPORT_SYMBOL_GPL(nfs_net_id); + +static int nfs_net_init(struct net *net) +{ +	nfs_clients_init(net); +	return nfs_dns_resolver_cache_init(net); +} + +static void nfs_net_exit(struct net *net) +{ +	nfs_dns_resolver_cache_destroy(net); +	nfs_cleanup_cb_ident_idr(net); +} + +static struct pernet_operations nfs_net_ops = { +	.init = nfs_net_init, +	.exit = nfs_net_exit, +	.id   = &nfs_net_id, +	.size = sizeof(struct nfs_net), +}; +  /*   * Initialize NFS   */ @@ -1561,10 +1648,14 @@ static int __init init_nfs_fs(void)  	err = nfs_idmap_init();  	if (err < 0) -		goto out9; +		goto out10;  	err = nfs_dns_resolver_init();  	if (err < 0) +		goto out9; + +	err = register_pernet_subsys(&nfs_net_ops); +	if (err < 0)  		goto out8;  	err = nfs_fscache_register(); @@ -1600,14 +1691,14 @@ static int __init init_nfs_fs(void)  		goto out0;  #ifdef CONFIG_PROC_FS -	rpc_proc_register(&nfs_rpcstat); 
+	rpc_proc_register(&init_net, &nfs_rpcstat);  #endif  	if ((err = register_nfs_fs()) != 0)  		goto out;  	return 0;  out:  #ifdef CONFIG_PROC_FS -	rpc_proc_unregister("nfs"); +	rpc_proc_unregister(&init_net, "nfs");  #endif  	nfs_destroy_directcache();  out0: @@ -1625,10 +1716,12 @@ out5:  out6:  	nfs_fscache_unregister();  out7: -	nfs_dns_resolver_destroy(); +	unregister_pernet_subsys(&nfs_net_ops);  out8: -	nfs_idmap_quit(); +	nfs_dns_resolver_destroy();  out9: +	nfs_idmap_quit(); +out10:  	return err;  } @@ -1640,12 +1733,12 @@ static void __exit exit_nfs_fs(void)  	nfs_destroy_inodecache();  	nfs_destroy_nfspagecache();  	nfs_fscache_unregister(); +	unregister_pernet_subsys(&nfs_net_ops);  	nfs_dns_resolver_destroy();  	nfs_idmap_quit();  #ifdef CONFIG_PROC_FS -	rpc_proc_unregister("nfs"); +	rpc_proc_unregister(&init_net, "nfs");  #endif -	nfs_cleanup_cb_ident_idr();  	unregister_nfs_fs();  	nfs_fs_proc_exit();  	nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8102db9b926..2476dc69365 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -123,6 +123,7 @@ struct nfs_parsed_mount_data {  	} nfs_server;  	struct security_mnt_opts lsm_opts; +	struct net		*net;  };  /* mount_clnt.c */ @@ -137,20 +138,22 @@ struct nfs_mount_request {  	int			noresvport;  	unsigned int		*auth_flav_len;  	rpc_authflavor_t	*auth_flavs; +	struct net		*net;  };  extern int nfs_mount(struct nfs_mount_request *info);  extern void nfs_umount(const struct nfs_mount_request *info);  /* client.c */ -extern struct rpc_program nfs_program; +extern const struct rpc_program nfs_program; +extern void nfs_clients_init(struct net *net); -extern void nfs_cleanup_cb_ident_idr(void); +extern void nfs_cleanup_cb_ident_idr(struct net *);  extern void nfs_put_client(struct nfs_client *); -extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *); -extern struct nfs_client *nfs4_find_client_ident(int); +extern struct nfs_client *nfs4_find_client_ident(struct 
net *, int);  extern struct nfs_client * -nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *); +nfs4_find_client_sessionid(struct net *, const struct sockaddr *, +				struct nfs4_sessionid *);  extern struct nfs_server *nfs_create_server(  					const struct nfs_parsed_mount_data *,  					struct nfs_fh *); @@ -329,6 +332,8 @@ void nfs_retry_commit(struct list_head *page_list,  void nfs_commit_clear_lock(struct nfs_inode *nfsi);  void nfs_commitdata_release(void *data);  void nfs_commit_release_pages(struct nfs_write_data *data); +void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head); +void nfs_request_remove_commit_list(struct nfs_page *req);  #ifdef CONFIG_MIGRATION  extern int nfs_migrate_page(struct address_space *, diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index d4c2d6b7507..8e65c7f1f87 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -16,7 +16,7 @@  #include <linux/nfs_fs.h>  #include "internal.h" -#ifdef RPC_DEBUG +#ifdef NFS_DEBUG  # define NFSDBG_FACILITY	NFSDBG_MOUNT  #endif @@ -67,7 +67,7 @@ enum {  	MOUNTPROC3_EXPORT	= 5,  }; -static struct rpc_program	mnt_program; +static const struct rpc_program mnt_program;  /*   * Defined by OpenGroup XNFS Version 3W, chapter 8 @@ -153,7 +153,7 @@ int nfs_mount(struct nfs_mount_request *info)  		.rpc_resp	= &result,  	};  	struct rpc_create_args args = { -		.net		= &init_net, +		.net		= info->net,  		.protocol	= info->protocol,  		.address	= info->sap,  		.addrsize	= info->salen, @@ -225,7 +225,7 @@ void nfs_umount(const struct nfs_mount_request *info)  		.to_retries = 2,  	};  	struct rpc_create_args args = { -		.net		= &init_net, +		.net		= info->net,  		.protocol	= IPPROTO_UDP,  		.address	= info->sap,  		.addrsize	= info->salen, @@ -488,19 +488,19 @@ static struct rpc_procinfo mnt3_procedures[] = {  }; -static struct rpc_version mnt_version1 = { +static const struct rpc_version mnt_version1 = {  	.number		= 1,  	.nrprocs	= 
ARRAY_SIZE(mnt_procedures),  	.procs		= mnt_procedures,  }; -static struct rpc_version mnt_version3 = { +static const struct rpc_version mnt_version3 = {  	.number		= 3,  	.nrprocs	= ARRAY_SIZE(mnt3_procedures),  	.procs		= mnt3_procedures,  }; -static struct rpc_version *mnt_version[] = { +static const struct rpc_version *mnt_version[] = {  	NULL,  	&mnt_version1,  	NULL, @@ -509,7 +509,7 @@ static struct rpc_version *mnt_version[] = {  static struct rpc_stat mnt_stats; -static struct rpc_program mnt_program = { +static const struct rpc_program mnt_program = {  	.name		= "mount",  	.number		= NFS_MNT_PROGRAM,  	.nrvers		= ARRAY_SIZE(mnt_version), diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 8102391bb37..1807866bb3a 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -276,7 +276,10 @@ out:  	nfs_free_fattr(fattr);  	nfs_free_fhandle(fh);  out_nofree: -	dprintk("<-- nfs_follow_mountpoint() = %p\n", mnt); +	if (IS_ERR(mnt)) +		dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); +	else +		dprintk("<-- %s() = %p\n", __func__, mnt);  	return mnt;  } diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h new file mode 100644 index 00000000000..aa14ec303e9 --- /dev/null +++ b/fs/nfs/netns.h @@ -0,0 +1,27 @@ +#ifndef __NFS_NETNS_H__ +#define __NFS_NETNS_H__ + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +struct bl_dev_msg { +	int32_t status; +	uint32_t major, minor; +}; + +struct nfs_net { +	struct cache_detail *nfs_dns_resolve; +	struct rpc_pipe *bl_device_pipe; +	struct bl_dev_msg bl_mount_reply; +	wait_queue_head_t bl_wq; +	struct list_head nfs_client_list; +	struct list_head nfs_volume_list; +#ifdef CONFIG_NFS_V4 +	struct idr cb_ident_idr; /* Protected by nfs_client_lock */ +#endif +	spinlock_t nfs_client_lock; +}; + +extern int nfs_net_id; + +#endif diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 792cb13a430..1f56000fabb 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -1150,7 +1150,7 @@ struct rpc_procinfo	
nfs_procedures[] = {  	PROC(STATFS,	fhandle,	statfsres,	0),  }; -struct rpc_version		nfs_version2 = { +const struct rpc_version nfs_version2 = {  	.number			= 2,  	.nrprocs		= ARRAY_SIZE(nfs_procedures),  	.procs			= nfs_procedures diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 7ef23979896..e4498dc351a 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -192,7 +192,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)  		.pages = pages,  	};  	struct nfs3_getaclres res = { -		0 +		NULL,  	};  	struct rpc_message msg = {  		.rpc_argp	= &args, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 91943953a37..5242eae6711 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -428,6 +428,11 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE];  } +static void nfs3_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ +	rpc_call_start(task); +} +  static int  nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir)  { @@ -445,6 +450,11 @@ nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME];  } +static void nfs3_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ +	rpc_call_start(task); +} +  static int  nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  		      struct inode *new_dir) @@ -814,6 +824,11 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message  	msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];  } +static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +{ +	rpc_call_start(task); +} +  static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)  {  	if (nfs3_async_handle_jukebox(task, data->inode)) @@ -828,6 +843,11 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag  	msg->rpc_proc = 
&nfs3_procedures[NFS3PROC_WRITE];  } +static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +{ +	rpc_call_start(task); +} +  static int nfs3_commit_done(struct rpc_task *task, struct nfs_write_data *data)  {  	if (nfs3_async_handle_jukebox(task, data->inode)) @@ -864,9 +884,11 @@ const struct nfs_rpc_ops nfs_v3_clientops = {  	.create		= nfs3_proc_create,  	.remove		= nfs3_proc_remove,  	.unlink_setup	= nfs3_proc_unlink_setup, +	.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,  	.unlink_done	= nfs3_proc_unlink_done,  	.rename		= nfs3_proc_rename,  	.rename_setup	= nfs3_proc_rename_setup, +	.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,  	.rename_done	= nfs3_proc_rename_done,  	.link		= nfs3_proc_link,  	.symlink	= nfs3_proc_symlink, @@ -879,8 +901,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = {  	.pathconf	= nfs3_proc_pathconf,  	.decode_dirent	= nfs3_decode_dirent,  	.read_setup	= nfs3_proc_read_setup, +	.read_rpc_prepare = nfs3_proc_read_rpc_prepare,  	.read_done	= nfs3_read_done,  	.write_setup	= nfs3_proc_write_setup, +	.write_rpc_prepare = nfs3_proc_write_rpc_prepare,  	.write_done	= nfs3_write_done,  	.commit_setup	= nfs3_proc_commit_setup,  	.commit_done	= nfs3_commit_done, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 183c6b123d0..a77cc9a3ce5 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -2461,7 +2461,7 @@ struct rpc_procinfo	nfs3_procedures[] = {  	PROC(COMMIT,		commit,		commit,		5),  }; -struct rpc_version		nfs_version3 = { +const struct rpc_version nfs_version3 = {  	.number			= 3,  	.nrprocs		= ARRAY_SIZE(nfs3_procedures),  	.procs			= nfs3_procedures @@ -2489,7 +2489,7 @@ static struct rpc_procinfo	nfs3_acl_procedures[] = {  	},  }; -struct rpc_version		nfsacl_version3 = { +const struct rpc_version nfsacl_version3 = {  	.number			= 3,  	.nrprocs		= sizeof(nfs3_acl_procedures)/  				  sizeof(nfs3_acl_procedures[0]), diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 
4d7d0aedc10..97ecc863dd7 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -20,7 +20,6 @@ enum nfs4_client_state {  	NFS4CLNT_RECLAIM_REBOOT,  	NFS4CLNT_RECLAIM_NOGRACE,  	NFS4CLNT_DELEGRETURN, -	NFS4CLNT_LAYOUTRECALL,  	NFS4CLNT_SESSION_RESET,  	NFS4CLNT_RECALL_SLOT,  	NFS4CLNT_LEASE_CONFIRM, @@ -44,7 +43,7 @@ struct nfs4_minor_version_ops {  			struct nfs4_sequence_args *args,  			struct nfs4_sequence_res *res,  			int cache_reply); -	int	(*validate_stateid)(struct nfs_delegation *, +	bool	(*match_stateid)(const nfs4_stateid *,  			const nfs4_stateid *);  	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *,  			struct nfs_fsinfo *); @@ -53,26 +52,25 @@ struct nfs4_minor_version_ops {  	const struct nfs4_state_maintenance_ops *state_renewal_ops;  }; -/* - * struct rpc_sequence ensures that RPC calls are sent in the exact - * order that they appear on the list. - */ -struct rpc_sequence { -	struct rpc_wait_queue	wait;	/* RPC call delay queue */ -	spinlock_t lock;		/* Protects the list */ -	struct list_head list;		/* Defines sequence of RPC calls */ +struct nfs_unique_id { +	struct rb_node rb_node; +	__u64 id;  };  #define NFS_SEQID_CONFIRMED 1  struct nfs_seqid_counter { -	struct rpc_sequence *sequence; +	int owner_id;  	int flags;  	u32 counter; +	spinlock_t lock;		/* Protects the list */ +	struct list_head list;		/* Defines sequence of RPC calls */ +	struct rpc_wait_queue	wait;	/* RPC call delay queue */  };  struct nfs_seqid {  	struct nfs_seqid_counter *sequence;  	struct list_head list; +	struct rpc_task *task;  };  static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status) @@ -81,18 +79,12 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status  		seqid->flags |= NFS_SEQID_CONFIRMED;  } -struct nfs_unique_id { -	struct rb_node rb_node; -	__u64 id; -}; -  /*   * NFS4 state_owners and lock_owners are simply labels for ordered   * sequences of RPC calls. 
Their sole purpose is to provide once-only   * semantics by allowing the server to identify replayed requests.   */  struct nfs4_state_owner { -	struct nfs_unique_id so_owner_id;  	struct nfs_server    *so_server;  	struct list_head     so_lru;  	unsigned long        so_expires; @@ -105,7 +97,6 @@ struct nfs4_state_owner {  	unsigned long	     so_flags;  	struct list_head     so_states;  	struct nfs_seqid_counter so_seqid; -	struct rpc_sequence  so_sequence;  };  enum { @@ -146,8 +137,6 @@ struct nfs4_lock_state {  #define NFS_LOCK_INITIALIZED 1  	int			ls_flags;  	struct nfs_seqid_counter	ls_seqid; -	struct rpc_sequence	ls_sequence; -	struct nfs_unique_id	ls_id;  	nfs4_stateid		ls_stateid;  	atomic_t		ls_count;  	struct nfs4_lock_owner	ls_owner; @@ -193,6 +182,7 @@ struct nfs4_exception {  	long timeout;  	int retry;  	struct nfs4_state *state; +	struct inode *inode;  };  struct nfs4_state_recovery_ops { @@ -224,7 +214,7 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, boo  extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);  extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,  		struct nfs4_fs_locations *fs_locations, struct page *page); -extern void nfs4_release_lockowner(const struct nfs4_lock_state *); +extern int nfs4_release_lockowner(struct nfs4_lock_state *);  extern const struct xattr_handler *nfs4_xattr_handlers[];  #if defined(CONFIG_NFS_V4_1) @@ -233,12 +223,13 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser  	return server->nfs_client->cl_session;  } +extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy);  extern int nfs4_setup_sequence(const struct nfs_server *server,  		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, -		int cache_reply, struct rpc_task *task); +		struct rpc_task *task);  extern int nfs41_setup_sequence(struct nfs4_session *session,  		struct nfs4_sequence_args *args, 
struct nfs4_sequence_res *res, -		int cache_reply, struct rpc_task *task); +		struct rpc_task *task);  extern void nfs4_destroy_session(struct nfs4_session *session);  extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);  extern int nfs4_proc_create_session(struct nfs_client *); @@ -269,7 +260,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser  static inline int nfs4_setup_sequence(const struct nfs_server *server,  		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, -		int cache_reply, struct rpc_task *task) +		struct rpc_task *task)  {  	return 0;  } @@ -319,7 +310,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)  }  #endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); +extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);  extern void nfs4_put_state_owner(struct nfs4_state_owner *);  extern void nfs4_purge_state_owners(struct nfs_server *);  extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); @@ -327,6 +318,8 @@ extern void nfs4_put_open_state(struct nfs4_state *);  extern void nfs4_close_state(struct nfs4_state *, fmode_t);  extern void nfs4_close_sync(struct nfs4_state *, fmode_t);  extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); +extern void nfs_inode_find_state_and_recover(struct inode *inode, +		const nfs4_stateid *stateid);  extern void nfs4_schedule_lease_recovery(struct nfs_client *);  extern void nfs4_schedule_state_manager(struct nfs_client *);  extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); @@ -337,7 +330,8 @@ extern void nfs41_handle_server_scope(struct nfs_client *,  				      struct server_scope **);  extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);  extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 
-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); +extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, +		fmode_t, fl_owner_t, pid_t);  extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);  extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -346,6 +340,8 @@ extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);  extern void nfs_release_seqid(struct nfs_seqid *seqid);  extern void nfs_free_seqid(struct nfs_seqid *seqid); +extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); +  extern const nfs4_stateid zero_stateid;  /* nfs4xdr.c */ @@ -357,6 +353,16 @@ struct nfs4_mount_data;  extern struct svc_version nfs4_callback_version1;  extern struct svc_version nfs4_callback_version4; +static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) +{ +	memcpy(dst, src, sizeof(*dst)); +} + +static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) +{ +	return memcmp(dst, src, sizeof(*dst)) == 0; +} +  #else  #define nfs4_close_state(a, b) do { } while (0) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 71ec08617e2..634c0bcb4fd 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -33,7 +33,10 @@  #include <linux/nfs_page.h>  #include <linux/module.h> +#include <linux/sunrpc/metrics.h> +  #include "internal.h" +#include "delegation.h"  #include "nfs4filelayout.h"  #define NFSDBG_FACILITY         NFSDBG_PNFS_LD @@ -84,12 +87,27 @@ static int filelayout_async_handle_error(struct rpc_task *task,  					 struct nfs_client *clp,  					 int *reset)  { +	struct nfs_server *mds_server = NFS_SERVER(state->inode); +	struct nfs_client *mds_client = mds_server->nfs_client; +  	if (task->tk_status >= 0)  		return 0; -  	*reset = 0;  	switch (task->tk_status) { +	/* MDS state errors */ +	case -NFS4ERR_DELEG_REVOKED: +	case 
-NFS4ERR_ADMIN_REVOKED: +	case -NFS4ERR_BAD_STATEID: +		nfs_remove_bad_delegation(state->inode); +	case -NFS4ERR_OPENMODE: +		nfs4_schedule_stateid_recovery(mds_server, state); +		goto wait_on_recovery; +	case -NFS4ERR_EXPIRED: +		nfs4_schedule_stateid_recovery(mds_server, state); +		nfs4_schedule_lease_recovery(mds_client); +		goto wait_on_recovery; +	/* DS session errors */  	case -NFS4ERR_BADSESSION:  	case -NFS4ERR_BADSLOT:  	case -NFS4ERR_BAD_HIGH_SLOT: @@ -115,8 +133,14 @@ static int filelayout_async_handle_error(struct rpc_task *task,  		*reset = 1;  		break;  	} +out:  	task->tk_status = 0;  	return -EAGAIN; +wait_on_recovery: +	rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); +	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) +		rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); +	goto out;  }  /* NFS_PROTO call done callback routines */ @@ -173,7 +197,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)  	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,  				&rdata->args.seq_args, &rdata->res.seq_res, -				0, task)) +				task))  		return;  	rpc_call_start(task); @@ -189,10 +213,18 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)  	rdata->mds_ops->rpc_call_done(task, data);  } +static void filelayout_read_count_stats(struct rpc_task *task, void *data) +{ +	struct nfs_read_data *rdata = (struct nfs_read_data *)data; + +	rpc_count_iostats(task, NFS_SERVER(rdata->inode)->client->cl_metrics); +} +  static void filelayout_read_release(void *data)  {  	struct nfs_read_data *rdata = (struct nfs_read_data *)data; +	put_lseg(rdata->lseg);  	rdata->mds_ops->rpc_release(data);  } @@ -254,7 +286,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)  	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,  				&wdata->args.seq_args, &wdata->res.seq_res, -				0, task)) +				task))  		return;  	rpc_call_start(task); @@ -268,10 +300,18 @@ static void 
filelayout_write_call_done(struct rpc_task *task, void *data)  	wdata->mds_ops->rpc_call_done(task, data);  } +static void filelayout_write_count_stats(struct rpc_task *task, void *data) +{ +	struct nfs_write_data *wdata = (struct nfs_write_data *)data; + +	rpc_count_iostats(task, NFS_SERVER(wdata->inode)->client->cl_metrics); +} +  static void filelayout_write_release(void *data)  {  	struct nfs_write_data *wdata = (struct nfs_write_data *)data; +	put_lseg(wdata->lseg);  	wdata->mds_ops->rpc_release(data);  } @@ -282,24 +322,28 @@ static void filelayout_commit_release(void *data)  	nfs_commit_release_pages(wdata);  	if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))  		nfs_commit_clear_lock(NFS_I(wdata->inode)); +	put_lseg(wdata->lseg);  	nfs_commitdata_release(wdata);  } -struct rpc_call_ops filelayout_read_call_ops = { +static const struct rpc_call_ops filelayout_read_call_ops = {  	.rpc_call_prepare = filelayout_read_prepare,  	.rpc_call_done = filelayout_read_call_done, +	.rpc_count_stats = filelayout_read_count_stats,  	.rpc_release = filelayout_read_release,  }; -struct rpc_call_ops filelayout_write_call_ops = { +static const struct rpc_call_ops filelayout_write_call_ops = {  	.rpc_call_prepare = filelayout_write_prepare,  	.rpc_call_done = filelayout_write_call_done, +	.rpc_count_stats = filelayout_write_count_stats,  	.rpc_release = filelayout_write_release,  }; -struct rpc_call_ops filelayout_commit_call_ops = { +static const struct rpc_call_ops filelayout_commit_call_ops = {  	.rpc_call_prepare = filelayout_write_prepare,  	.rpc_call_done = filelayout_write_call_done, +	.rpc_count_stats = filelayout_write_count_stats,  	.rpc_release = filelayout_commit_release,  }; @@ -367,7 +411,8 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)  	idx = nfs4_fl_calc_ds_index(lseg, j);  	ds = nfs4_fl_prepare_ds(lseg, idx);  	if (!ds) { -		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); +		printk(KERN_ERR "NFS: %s: 
prepare_ds failed, use MDS\n", +			__func__);  		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);  		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);  		return PNFS_NOT_ATTEMPTED; @@ -575,7 +620,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,  			goto out_err_free;  		fl->fh_array[i]->size = be32_to_cpup(p++);  		if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { -			printk(KERN_ERR "Too big fh %d received %d\n", +			printk(KERN_ERR "NFS: Too big fh %d received %d\n",  			       i, fl->fh_array[i]->size);  			goto out_err_free;  		} @@ -640,14 +685,16 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,  		int size = (fl->stripe_type == STRIPE_SPARSE) ?  			fl->dsaddr->ds_num : fl->dsaddr->stripe_count; -		fl->commit_buckets = kcalloc(size, sizeof(struct list_head), gfp_flags); +		fl->commit_buckets = kcalloc(size, sizeof(struct nfs4_fl_commit_bucket), gfp_flags);  		if (!fl->commit_buckets) {  			filelayout_free_lseg(&fl->generic_hdr);  			return NULL;  		}  		fl->number_of_buckets = size; -		for (i = 0; i < size; i++) -			INIT_LIST_HEAD(&fl->commit_buckets[i]); +		for (i = 0; i < size; i++) { +			INIT_LIST_HEAD(&fl->commit_buckets[i].written); +			INIT_LIST_HEAD(&fl->commit_buckets[i].committing); +		}  	}  	return &fl->generic_hdr;  } @@ -679,7 +726,7 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,  	return (p_stripe == r_stripe);  } -void +static void  filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,  			struct nfs_page *req)  { @@ -696,7 +743,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,  		nfs_pageio_reset_read_mds(pgio);  } -void +static void  filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,  			 struct nfs_page *req)  { @@ -725,11 +772,6 @@ static const struct nfs_pageio_ops filelayout_pg_write_ops = {  	.pg_doio = pnfs_generic_pg_writepages,  }; -static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) -{ -	return 
!FILELAYOUT_LSEG(lseg)->commit_through_mds; -} -  static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)  {  	if (fl->stripe_type == STRIPE_SPARSE) @@ -738,13 +780,49 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)  		return j;  } -struct list_head *filelayout_choose_commit_list(struct nfs_page *req) +/* The generic layer is about to remove the req from the commit list. + * If this will make the bucket empty, it will need to put the lseg reference. + */ +static void +filelayout_clear_request_commit(struct nfs_page *req) +{ +	struct pnfs_layout_segment *freeme = NULL; +	struct inode *inode = req->wb_context->dentry->d_inode; + +	spin_lock(&inode->i_lock); +	if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) +		goto out; +	if (list_is_singular(&req->wb_list)) { +		struct inode *inode = req->wb_context->dentry->d_inode; +		struct pnfs_layout_segment *lseg; + +		/* From here we can find the bucket, but for the moment, +		 * since there is only one relevant lseg... +		 */ +		list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { +			if (lseg->pls_range.iomode == IOMODE_RW) { +				freeme = lseg; +				break; +			} +		} +	} +out: +	nfs_request_remove_commit_list(req); +	spin_unlock(&inode->i_lock); +	put_lseg(freeme); +} + +static struct list_head * +filelayout_choose_commit_list(struct nfs_page *req, +			      struct pnfs_layout_segment *lseg)  { -	struct pnfs_layout_segment *lseg = req->wb_commit_lseg;  	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);  	u32 i, j;  	struct list_head *list; +	if (fl->commit_through_mds) +		return &NFS_I(req->wb_context->dentry->d_inode)->commit_list; +  	/* Note that we are calling nfs4_fl_calc_j_index on each page  	 * that ends up being committed to a data server.  
An attractive  	 * alternative is to add a field to nfs_write_data and nfs_page @@ -754,14 +832,30 @@ struct list_head *filelayout_choose_commit_list(struct nfs_page *req)  	j = nfs4_fl_calc_j_index(lseg,  				 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);  	i = select_bucket_index(fl, j); -	list = &fl->commit_buckets[i]; +	list = &fl->commit_buckets[i].written;  	if (list_empty(list)) { -		/* Non-empty buckets hold a reference on the lseg */ +		/* Non-empty buckets hold a reference on the lseg.  That ref +		 * is normally transferred to the COMMIT call and released +		 * there.  It could also be released if the last req is pulled +		 * off due to a rewrite, in which case it will be done in +		 * filelayout_remove_commit_req +		 */  		get_lseg(lseg);  	} +	set_bit(PG_COMMIT_TO_DS, &req->wb_flags);  	return list;  } +static void +filelayout_mark_request_commit(struct nfs_page *req, +		struct pnfs_layout_segment *lseg) +{ +	struct list_head *list; + +	list = filelayout_choose_commit_list(req, lseg); +	nfs_request_add_commit_list(req, list); +} +  static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)  {  	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); @@ -797,11 +891,12 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)  	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);  	ds = nfs4_fl_prepare_ds(lseg, idx);  	if (!ds) { -		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); +		printk(KERN_ERR "NFS: %s: prepare_ds failed, use MDS\n", +			__func__);  		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);  		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);  		prepare_to_resend_writes(data); -		data->mds_ops->rpc_release(data); +		filelayout_commit_release(data);  		return -EAGAIN;  	}  	dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); @@ -817,24 +912,87 @@ static int filelayout_initiate_commit(struct nfs_write_data *data, int how)  /*   * 
This is only useful while we are using whole file layouts.   */ -static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +static struct pnfs_layout_segment * +find_only_write_lseg_locked(struct inode *inode)  { -	struct pnfs_layout_segment *lseg, *rv = NULL; +	struct pnfs_layout_segment *lseg; -	spin_lock(&inode->i_lock);  	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)  		if (lseg->pls_range.iomode == IOMODE_RW) -			rv = get_lseg(lseg); +			return lseg; +	return NULL; +} + +static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +{ +	struct pnfs_layout_segment *rv; + +	spin_lock(&inode->i_lock); +	rv = find_only_write_lseg_locked(inode); +	if (rv) +		get_lseg(rv);  	spin_unlock(&inode->i_lock);  	return rv;  } -static int alloc_ds_commits(struct inode *inode, struct list_head *list) +static int +filelayout_scan_ds_commit_list(struct nfs4_fl_commit_bucket *bucket, int max, +		spinlock_t *lock) +{ +	struct list_head *src = &bucket->written; +	struct list_head *dst = &bucket->committing; +	struct nfs_page *req, *tmp; +	int ret = 0; + +	list_for_each_entry_safe(req, tmp, src, wb_list) { +		if (!nfs_lock_request(req)) +			continue; +		if (cond_resched_lock(lock)) +			list_safe_reset_next(req, tmp, wb_list); +		nfs_request_remove_commit_list(req); +		clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); +		nfs_list_add_request(req, dst); +		ret++; +		if (ret == max) +			break; +	} +	return ret; +} + +/* Move reqs from written to committing lists, returning count of number moved. + * Note called with i_lock held. 
+ */ +static int filelayout_scan_commit_lists(struct inode *inode, int max, +		spinlock_t *lock) +{ +	struct pnfs_layout_segment *lseg; +	struct nfs4_filelayout_segment *fl; +	int i, rv = 0, cnt; + +	lseg = find_only_write_lseg_locked(inode); +	if (!lseg) +		goto out_done; +	fl = FILELAYOUT_LSEG(lseg); +	if (fl->commit_through_mds) +		goto out_done; +	for (i = 0; i < fl->number_of_buckets && max != 0; i++) { +		cnt = filelayout_scan_ds_commit_list(&fl->commit_buckets[i], +				max, lock); +		max -= cnt; +		rv += cnt; +	} +out_done: +	return rv; +} + +static unsigned int +alloc_ds_commits(struct inode *inode, struct list_head *list)  {  	struct pnfs_layout_segment *lseg;  	struct nfs4_filelayout_segment *fl;  	struct nfs_write_data *data;  	int i, j; +	unsigned int nreq = 0;  	/* Won't need this when non-whole file layout segments are supported  	 * instead we will use a pnfs_layout_hdr structure */ @@ -843,28 +1001,27 @@ static int alloc_ds_commits(struct inode *inode, struct list_head *list)  		return 0;  	fl = FILELAYOUT_LSEG(lseg);  	for (i = 0; i < fl->number_of_buckets; i++) { -		if (list_empty(&fl->commit_buckets[i])) +		if (list_empty(&fl->commit_buckets[i].committing))  			continue;  		data = nfs_commitdata_alloc();  		if (!data) -			goto out_bad; +			break;  		data->ds_commit_index = i;  		data->lseg = lseg;  		list_add(&data->pages, list); +		nreq++;  	} -	put_lseg(lseg); -	return 0; -out_bad: +	/* Clean up on error */  	for (j = i; j < fl->number_of_buckets; j++) { -		if (list_empty(&fl->commit_buckets[i])) +		if (list_empty(&fl->commit_buckets[i].committing))  			continue; -		nfs_retry_commit(&fl->commit_buckets[i], lseg); +		nfs_retry_commit(&fl->commit_buckets[i].committing, lseg);  		put_lseg(lseg);  /* associated with emptying bucket */  	}  	put_lseg(lseg);  	/* Caller will clean up entries put on list */ -	return -ENOMEM; +	return nreq;  }  /* This follows nfs_commit_list pretty closely */ @@ -874,40 +1031,40 @@ filelayout_commit_pagelist(struct 
inode *inode, struct list_head *mds_pages,  {  	struct nfs_write_data	*data, *tmp;  	LIST_HEAD(list); +	unsigned int nreq = 0;  	if (!list_empty(mds_pages)) {  		data = nfs_commitdata_alloc(); -		if (!data) -			goto out_bad; -		data->lseg = NULL; -		list_add(&data->pages, &list); +		if (data != NULL) { +			data->lseg = NULL; +			list_add(&data->pages, &list); +			nreq++; +		} else +			nfs_retry_commit(mds_pages, NULL);  	} -	if (alloc_ds_commits(inode, &list)) -		goto out_bad; +	nreq += alloc_ds_commits(inode, &list); + +	if (nreq == 0) { +		nfs_commit_clear_lock(NFS_I(inode)); +		goto out; +	} + +	atomic_add(nreq, &NFS_I(inode)->commits_outstanding);  	list_for_each_entry_safe(data, tmp, &list, pages) {  		list_del_init(&data->pages); -		atomic_inc(&NFS_I(inode)->commits_outstanding);  		if (!data->lseg) {  			nfs_init_commit(data, mds_pages, NULL);  			nfs_initiate_commit(data, NFS_CLIENT(inode),  					    data->mds_ops, how);  		} else { -			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg); +			nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index].committing, data->lseg);  			filelayout_initiate_commit(data, how);  		}  	} -	return 0; - out_bad: -	list_for_each_entry_safe(data, tmp, &list, pages) { -		nfs_retry_commit(&data->pages, data->lseg); -		list_del_init(&data->pages); -		nfs_commit_free(data); -	} -	nfs_retry_commit(mds_pages, NULL); -	nfs_commit_clear_lock(NFS_I(inode)); -	return -ENOMEM; +out: +	return PNFS_ATTEMPTED;  }  static void @@ -924,8 +1081,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {  	.free_lseg		= filelayout_free_lseg,  	.pg_read_ops		= &filelayout_pg_read_ops,  	.pg_write_ops		= &filelayout_pg_write_ops, -	.mark_pnfs_commit	= filelayout_mark_pnfs_commit, -	.choose_commit_list	= filelayout_choose_commit_list, +	.mark_request_commit	= filelayout_mark_request_commit, +	.clear_request_commit	= filelayout_clear_request_commit, +	
.scan_commit_lists	= filelayout_scan_commit_lists,  	.commit_pagelist	= filelayout_commit_pagelist,  	.read_pagelist		= filelayout_read_pagelist,  	.write_pagelist		= filelayout_write_pagelist, diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 2e42284253f..21190bb1f5e 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -74,6 +74,11 @@ struct nfs4_file_layout_dsaddr {  	struct nfs4_pnfs_ds		*ds_list[1];  }; +struct nfs4_fl_commit_bucket { +	struct list_head written; +	struct list_head committing; +}; +  struct nfs4_filelayout_segment {  	struct pnfs_layout_segment generic_hdr;  	u32 stripe_type; @@ -84,7 +89,7 @@ struct nfs4_filelayout_segment {  	struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */  	unsigned int num_fh;  	struct nfs_fh **fh_array; -	struct list_head *commit_buckets; /* Sort commits to ds */ +	struct nfs4_fl_commit_bucket *commit_buckets; /* Sort commits to ds */  	int number_of_buckets;  }; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 8ae91908f5a..a866bbd2890 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -45,7 +45,7 @@   *   - incremented when a device id maps a data server already in the cache.   *   - decremented when deviceid is removed from the cache.   */ -DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static DEFINE_SPINLOCK(nfs4_ds_cache_lock);  static LIST_HEAD(nfs4_data_server_cache);  /* Debug routines */ @@ -108,58 +108,40 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)  	return false;  } -/* - * Lookup DS by addresses.  The first matching address returns true. 
- * nfs4_ds_cache_lock is held - */ -static struct nfs4_pnfs_ds * -_data_server_lookup_locked(struct list_head *dsaddrs) +static bool +_same_data_server_addrs_locked(const struct list_head *dsaddrs1, +			       const struct list_head *dsaddrs2)  { -	struct nfs4_pnfs_ds *ds;  	struct nfs4_pnfs_ds_addr *da1, *da2; -	list_for_each_entry(da1, dsaddrs, da_node) { -		list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { -			list_for_each_entry(da2, &ds->ds_addrs, da_node) { -				if (same_sockaddr( -					(struct sockaddr *)&da1->da_addr, -					(struct sockaddr *)&da2->da_addr)) -					return ds; -			} -		} +	/* step through both lists, comparing as we go */ +	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), +	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); +	     da1 != NULL && da2 != NULL; +	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), +	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { +		if (!same_sockaddr((struct sockaddr *)&da1->da_addr, +				   (struct sockaddr *)&da2->da_addr)) +			return false;  	} -	return NULL; +	if (da1 == NULL && da2 == NULL) +		return true; + +	return false;  }  /* - * Compare two lists of addresses. + * Lookup DS by addresses.  
nfs4_ds_cache_lock is held   */ -static bool -_data_server_match_all_addrs_locked(struct list_head *dsaddrs1, -				    struct list_head *dsaddrs2) +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(const struct list_head *dsaddrs)  { -	struct nfs4_pnfs_ds_addr *da1, *da2; -	size_t count1 = 0, -	       count2 = 0; - -	list_for_each_entry(da1, dsaddrs1, da_node) -		count1++; - -	list_for_each_entry(da2, dsaddrs2, da_node) { -		bool found = false; -		count2++; -		list_for_each_entry(da1, dsaddrs1, da_node) { -			if (same_sockaddr((struct sockaddr *)&da1->da_addr, -				(struct sockaddr *)&da2->da_addr)) { -				found = true; -				break; -			} -		} -		if (!found) -			return false; -	} +	struct nfs4_pnfs_ds *ds; -	return (count1 == count2); +	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) +		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) +			return ds; +	return NULL;  }  /* @@ -356,11 +338,6 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)  		dprintk("%s add new data server %s\n", __func__,  			ds->ds_remotestr);  	} else { -		if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs, -							 dsaddrs)) { -			dprintk("%s:  multipath address mismatch: %s != %s", -				__func__, tmp_ds->ds_remotestr, remotestr); -		}  		kfree(remotestr);  		kfree(ds);  		atomic_inc(&tmp_ds->ds_count); @@ -378,7 +355,7 @@ out:   * Currently only supports ipv4, ipv6 and one multi-path address.   
*/  static struct nfs4_pnfs_ds_addr * -decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) +decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)  {  	struct nfs4_pnfs_ds_addr *da = NULL;  	char *buf, *portstr; @@ -457,7 +434,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)  	INIT_LIST_HEAD(&da->da_node); -	if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr, +	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,  		      sizeof(da->da_addr))) {  		dprintk("%s: error parsing address %s\n", __func__, buf);  		goto out_free_da; @@ -554,7 +531,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)  	cnt = be32_to_cpup(p);  	dprintk("%s stripe count  %d\n", __func__, cnt);  	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { -		printk(KERN_WARNING "%s: stripe count %d greater than " +		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "  		       "supported maximum %d\n", __func__,  			cnt, NFS4_PNFS_MAX_STRIPE_CNT);  		goto out_err_free_scratch; @@ -585,7 +562,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)  	num = be32_to_cpup(p);  	dprintk("%s ds_num %u\n", __func__, num);  	if (num > NFS4_PNFS_MAX_MULTI_CNT) { -		printk(KERN_WARNING "%s: multipath count %d greater than " +		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "  			"supported maximum %d\n", __func__,  			num, NFS4_PNFS_MAX_MULTI_CNT);  		goto out_err_free_stripe_indices; @@ -593,7 +570,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)  	/* validate stripe indices are all < num */  	if (max_stripe_index >= num) { -		printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n", +		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",  			__func__, max_stripe_index, num);  		goto out_err_free_stripe_indices;  	} @@ -625,7 +602,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)  		mp_count = 
be32_to_cpup(p); /* multipath count */  		for (j = 0; j < mp_count; j++) { -			da = decode_ds_addr(&stream, gfp_flags); +			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->net, +					    &stream, gfp_flags);  			if (da)  				list_add_tail(&da->da_node, &dsaddrs);  		} @@ -686,7 +664,7 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl  	new = decode_device(inode, dev, gfp_flags);  	if (!new) { -		printk(KERN_WARNING "%s: Could not decode or add device\n", +		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",  			__func__);  		return NULL;  	} @@ -835,7 +813,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)  	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];  	if (ds == NULL) { -		printk(KERN_ERR "%s: No data server for offset index %d\n", +		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",  			__func__, ds_idx);  		return NULL;  	} diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index bb80c49b653..9c8eca315f4 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -94,13 +94,14 @@ static int nfs4_validate_fspath(struct dentry *dentry,  }  static size_t nfs_parse_server_name(char *string, size_t len, -		struct sockaddr *sa, size_t salen) +		struct sockaddr *sa, size_t salen, struct nfs_server *server)  { +	struct net *net = rpc_net_ns(server->client);  	ssize_t ret; -	ret = rpc_pton(string, len, sa, salen); +	ret = rpc_pton(net, string, len, sa, salen);  	if (ret == 0) { -		ret = nfs_dns_resolve_name(string, len, sa, salen); +		ret = nfs_dns_resolve_name(net, string, len, sa, salen);  		if (ret < 0)  			ret = 0;  	} @@ -137,7 +138,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,  			continue;  		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, -				mountdata->addr, addr_bufsize); +				mountdata->addr, addr_bufsize, +				NFS_SB(mountdata->sb));  		if (mountdata->addrlen == 0)  			continue; diff --git 
a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f0c849c98fe..e809d2305eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -72,18 +72,21 @@  #define NFS4_MAX_LOOP_ON_RECOVER (10) +static unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +  struct nfs4_opendata;  static int _nfs4_proc_open(struct nfs4_opendata *data);  static int _nfs4_recover_proc_open(struct nfs4_opendata *data);  static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);  static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);  static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);  static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  			    struct nfs_fattr *fattr, struct iattr *sattr,  			    struct nfs4_state *state);  #ifdef CONFIG_NFS_V4_1 -static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *); -static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *); +static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);  #endif  /* Prevent leaks of NFSv4 errors into userland */  static int nfs4_map_errors(int err) @@ -193,7 +196,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent  	 * when talking to the server, we always send cookie 0  	 * instead of 1 or 2.  	 
*/ -	start = p = kmap_atomic(*readdir->pages, KM_USER0); +	start = p = kmap_atomic(*readdir->pages);  	if (cookie == 0) {  		*p++ = xdr_one;                                  /* next */ @@ -221,7 +224,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent  	readdir->pgbase = (char *)p - (char *)start;  	readdir->count -= readdir->pgbase; -	kunmap_atomic(start, KM_USER0); +	kunmap_atomic(start);  }  static int nfs4_wait_clnt_recover(struct nfs_client *clp) @@ -259,15 +262,28 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc  {  	struct nfs_client *clp = server->nfs_client;  	struct nfs4_state *state = exception->state; +	struct inode *inode = exception->inode;  	int ret = errorcode;  	exception->retry = 0;  	switch(errorcode) {  		case 0:  			return 0; +		case -NFS4ERR_OPENMODE: +			if (nfs_have_delegation(inode, FMODE_READ)) { +				nfs_inode_return_delegation(inode); +				exception->retry = 1; +				return 0; +			} +			if (state == NULL) +				break; +			nfs4_schedule_stateid_recovery(server, state); +			goto wait_on_recovery; +		case -NFS4ERR_DELEG_REVOKED:  		case -NFS4ERR_ADMIN_REVOKED:  		case -NFS4ERR_BAD_STATEID: -		case -NFS4ERR_OPENMODE: +			if (state != NULL) +				nfs_remove_bad_delegation(state->inode);  			if (state == NULL)  				break;  			nfs4_schedule_stateid_recovery(server, state); @@ -360,16 +376,14 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp   * When updating highest_used_slotid there may be "holes" in the bitmap   * so we need to scan down from highest_used_slotid to 0 looking for the now   * highest slotid in use. - * If none found, highest_used_slotid is set to -1. + * If none found, highest_used_slotid is set to NFS4_NO_SLOT.   
*   * Must be called while holding tbl->slot_tbl_lock   */  static void -nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid) +nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid)  { -	int slotid = free_slotid; - -	BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); +	BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE);  	/* clear used bit in bitmap */  	__clear_bit(slotid, tbl->used_slots); @@ -379,10 +393,16 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)  		if (slotid < tbl->max_slots)  			tbl->highest_used_slotid = slotid;  		else -			tbl->highest_used_slotid = -1; +			tbl->highest_used_slotid = NFS4_NO_SLOT;  	} -	dprintk("%s: free_slotid %u highest_used_slotid %d\n", __func__, -		free_slotid, tbl->highest_used_slotid); +	dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, +		slotid, tbl->highest_used_slotid); +} + +bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy) +{ +	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +	return true;  }  /* @@ -390,16 +410,13 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)   */  static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)  { -	struct rpc_task *task; -  	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { -		task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); -		if (task) -			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +		rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq, +				nfs4_set_task_privileged, NULL);  		return;  	} -	if (ses->fc_slot_table.highest_used_slotid != -1) +	if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)  		return;  	dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__); @@ -412,7 +429,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)  void nfs4_check_drain_bc_complete(struct nfs4_session *ses)  {  	if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) || -	    ses->bc_slot_table.highest_used_slotid != -1) +	    ses->bc_slot_table.highest_used_slotid != 
NFS4_NO_SLOT)  		return;  	dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);  	complete(&ses->bc_slot_table.complete); @@ -507,25 +524,25 @@ static int nfs4_sequence_done(struct rpc_task *task,   * nfs4_find_slot looks for an unset bit in the used_slots bitmap.   * If found, we mark the slot as used, update the highest_used_slotid,   * and respectively set up the sequence operation args. - * The slot number is returned if found, or NFS4_MAX_SLOT_TABLE otherwise. + * The slot number is returned if found, or NFS4_NO_SLOT otherwise.   *   * Note: must be called with under the slot_tbl_lock.   */ -static u8 +static u32  nfs4_find_slot(struct nfs4_slot_table *tbl)  { -	int slotid; -	u8 ret_id = NFS4_MAX_SLOT_TABLE; -	BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE); +	u32 slotid; +	u32 ret_id = NFS4_NO_SLOT; -	dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n", +	dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",  		__func__, tbl->used_slots[0], tbl->highest_used_slotid,  		tbl->max_slots);  	slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);  	if (slotid >= tbl->max_slots)  		goto out;  	__set_bit(slotid, tbl->used_slots); -	if (slotid > tbl->highest_used_slotid) +	if (slotid > tbl->highest_used_slotid || +			tbl->highest_used_slotid == NFS4_NO_SLOT)  		tbl->highest_used_slotid = slotid;  	ret_id = slotid;  out: @@ -534,15 +551,25 @@ out:  	return ret_id;  } +static void nfs41_init_sequence(struct nfs4_sequence_args *args, +		struct nfs4_sequence_res *res, int cache_reply) +{ +	args->sa_session = NULL; +	args->sa_cache_this = 0; +	if (cache_reply) +		args->sa_cache_this = 1; +	res->sr_session = NULL; +	res->sr_slot = NULL; +} +  int nfs41_setup_sequence(struct nfs4_session *session,  				struct nfs4_sequence_args *args,  				struct nfs4_sequence_res *res, -				int cache_reply,  				struct rpc_task *task)  {  	struct nfs4_slot *slot;  	struct nfs4_slot_table *tbl; -	u8 slotid; +	u32 slotid;  	
dprintk("--> %s\n", __func__);  	/* slot already allocated? */ @@ -570,7 +597,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,  	}  	slotid = nfs4_find_slot(tbl); -	if (slotid == NFS4_MAX_SLOT_TABLE) { +	if (slotid == NFS4_NO_SLOT) {  		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);  		spin_unlock(&tbl->slot_tbl_lock);  		dprintk("<-- %s: no free slots\n", __func__); @@ -582,7 +609,6 @@ int nfs41_setup_sequence(struct nfs4_session *session,  	slot = tbl->slots + slotid;  	args->sa_session = session;  	args->sa_slotid = slotid; -	args->sa_cache_this = cache_reply;  	dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); @@ -602,24 +628,19 @@ EXPORT_SYMBOL_GPL(nfs41_setup_sequence);  int nfs4_setup_sequence(const struct nfs_server *server,  			struct nfs4_sequence_args *args,  			struct nfs4_sequence_res *res, -			int cache_reply,  			struct rpc_task *task)  {  	struct nfs4_session *session = nfs4_get_session(server);  	int ret = 0; -	if (session == NULL) { -		args->sa_session = NULL; -		res->sr_session = NULL; +	if (session == NULL)  		goto out; -	}  	dprintk("--> %s clp %p session %p sr_slot %td\n",  		__func__, session->clp, session, res->sr_slot ?  			
res->sr_slot - session->fc_slot_table.slots : -1); -	ret = nfs41_setup_sequence(session, args, res, cache_reply, -				   task); +	ret = nfs41_setup_sequence(session, args, res, task);  out:  	dprintk("<-- %s status=%d\n", __func__, ret);  	return ret; @@ -629,7 +650,6 @@ struct nfs41_call_sync_data {  	const struct nfs_server *seq_server;  	struct nfs4_sequence_args *seq_args;  	struct nfs4_sequence_res *seq_res; -	int cache_reply;  };  static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) @@ -639,7 +659,7 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)  	dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);  	if (nfs4_setup_sequence(data->seq_server, data->seq_args, -				data->seq_res, data->cache_reply, task)) +				data->seq_res, task))  		return;  	rpc_call_start(task);  } @@ -657,12 +677,12 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)  	nfs41_sequence_done(task, data->seq_res);  } -struct rpc_call_ops nfs41_call_sync_ops = { +static const struct rpc_call_ops nfs41_call_sync_ops = {  	.rpc_call_prepare = nfs41_call_sync_prepare,  	.rpc_call_done = nfs41_call_sync_done,  }; -struct rpc_call_ops nfs41_call_priv_sync_ops = { +static const struct rpc_call_ops nfs41_call_priv_sync_ops = {  	.rpc_call_prepare = nfs41_call_priv_sync_prepare,  	.rpc_call_done = nfs41_call_sync_done,  }; @@ -672,7 +692,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,  				   struct rpc_message *msg,  				   struct nfs4_sequence_args *args,  				   struct nfs4_sequence_res *res, -				   int cache_reply,  				   int privileged)  {  	int ret; @@ -681,7 +700,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,  		.seq_server = server,  		.seq_args = args,  		.seq_res = res, -		.cache_reply = cache_reply,  	};  	struct rpc_task_setup task_setup = {  		.rpc_client = clnt, @@ -690,7 +708,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,  		.callback_data = &data  	
}; -	res->sr_slot = NULL;  	if (privileged)  		task_setup.callback_ops = &nfs41_call_priv_sync_ops;  	task = rpc_run_task(&task_setup); @@ -710,10 +727,17 @@ int _nfs4_call_sync_session(struct rpc_clnt *clnt,  			    struct nfs4_sequence_res *res,  			    int cache_reply)  { -	return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0); +	nfs41_init_sequence(args, res, cache_reply); +	return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0);  }  #else +static inline +void nfs41_init_sequence(struct nfs4_sequence_args *args, +		struct nfs4_sequence_res *res, int cache_reply) +{ +} +  static int nfs4_sequence_done(struct rpc_task *task,  			       struct nfs4_sequence_res *res)  { @@ -728,7 +752,7 @@ int _nfs4_call_sync(struct rpc_clnt *clnt,  		    struct nfs4_sequence_res *res,  		    int cache_reply)  { -	args->sa_session = res->sr_session = NULL; +	nfs41_init_sequence(args, res, cache_reply);  	return rpc_call_sync(clnt, msg, 0);  } @@ -815,20 +839,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,  	p->o_arg.open_flags = flags;  	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);  	p->o_arg.clientid = server->nfs_client->cl_clientid; -	p->o_arg.id = sp->so_owner_id.id; +	p->o_arg.id = sp->so_seqid.owner_id;  	p->o_arg.name = &dentry->d_name;  	p->o_arg.server = server;  	p->o_arg.bitmask = server->attr_bitmask;  	p->o_arg.dir_bitmask = server->cache_consistency_bitmask;  	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; -	if (flags & O_CREAT) { -		u32 *s; +	if (attrs != NULL && attrs->ia_valid != 0) { +		__be32 verf[2];  		p->o_arg.u.attrs = &p->attrs;  		memcpy(&p->attrs, attrs, sizeof(p->attrs)); -		s = (u32 *) p->o_arg.u.verifier.data; -		s[0] = jiffies; -		s[1] = current->pid; + +		verf[0] = jiffies; +		verf[1] = current->pid; +		memcpy(p->o_arg.u.verifier.data, verf, +				sizeof(p->o_arg.u.verifier.data));  	}  	p->c_arg.fh = &p->o_res.fh;  	p->c_arg.stateid = &p->o_res.stateid; @@ -878,7 +904,7 @@ static int 
can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode  {  	int ret = 0; -	if (open_mode & O_EXCL) +	if (open_mode & (O_EXCL|O_TRUNC))  		goto out;  	switch (mode & (FMODE_READ|FMODE_WRITE)) {  		case FMODE_READ: @@ -927,8 +953,8 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)  static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)  {  	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) -		memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); -	memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); +		nfs4_stateid_copy(&state->stateid, stateid); +	nfs4_stateid_copy(&state->open_stateid, stateid);  	switch (fmode) {  		case FMODE_READ:  			set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -956,7 +982,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s  	 */  	write_seqlock(&state->seqlock);  	if (deleg_stateid != NULL) { -		memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); +		nfs4_stateid_copy(&state->stateid, deleg_stateid);  		set_bit(NFS_DELEGATED_STATE, &state->flags);  	}  	if (open_stateid != NULL) @@ -987,7 +1013,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat  	if (delegation == NULL)  		delegation = &deleg_cur->stateid; -	else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) +	else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation))  		goto no_delegation_unlock;  	nfs_mark_delegation_referenced(deleg_cur); @@ -1026,7 +1052,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)  	struct nfs4_state *state = opendata->state;  	struct nfs_inode *nfsi = NFS_I(state->inode);  	struct nfs_delegation *delegation; -	int open_mode = opendata->o_arg.open_flags & O_EXCL; +	int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC);  	fmode_t fmode = 
opendata->o_arg.fmode;  	nfs4_stateid stateid;  	int ret = -EAGAIN; @@ -1048,7 +1074,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)  			break;  		}  		/* Save the delegation */ -		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); +		nfs4_stateid_copy(&stateid, &delegation->stateid);  		rcu_read_unlock();  		ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);  		if (ret != 0) @@ -1090,6 +1116,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data  	if (state == NULL)  		goto err_put_inode;  	if (data->o_res.delegation_type != 0) { +		struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;  		int delegation_flags = 0;  		rcu_read_lock(); @@ -1101,7 +1128,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data  			pr_err_ratelimited("NFS: Broken NFSv4 server %s is "  					"returning a delegation for "  					"OPEN(CLAIM_DELEGATE_CUR)\n", -					NFS_CLIENT(inode)->cl_server); +					clp->cl_hostname);  		} else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)  			nfs_inode_set_delegation(state->inode,  					data->owner->so_cred, @@ -1210,10 +1237,10 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *  	 * Check if we need to update the current stateid.  	 
*/  	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && -	    memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { +	    !nfs4_stateid_match(&state->stateid, &state->open_stateid)) {  		write_seqlock(&state->seqlock);  		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) -			memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); +			nfs4_stateid_copy(&state->stateid, &state->open_stateid);  		write_sequnlock(&state->seqlock);  	}  	return 0; @@ -1282,8 +1309,7 @@ static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs  	if (IS_ERR(opendata))  		return PTR_ERR(opendata);  	opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; -	memcpy(opendata->o_arg.u.delegation.data, stateid->data, -			sizeof(opendata->o_arg.u.delegation.data)); +	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);  	ret = nfs4_open_recover(opendata, state);  	nfs4_opendata_put(opendata);  	return ret; @@ -1319,8 +1345,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state  				 * The show must go on: exit, but mark the  				 * stateid as needing recovery.  				 
*/ +			case -NFS4ERR_DELEG_REVOKED:  			case -NFS4ERR_ADMIN_REVOKED:  			case -NFS4ERR_BAD_STATEID: +				nfs_inode_find_state_and_recover(state->inode, +						stateid);  				nfs4_schedule_stateid_recovery(server, state);  			case -EKEYEXPIRED:  				/* @@ -1345,8 +1374,7 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)  	data->rpc_status = task->tk_status;  	if (data->rpc_status == 0) { -		memcpy(data->o_res.stateid.data, data->c_res.stateid.data, -				sizeof(data->o_res.stateid.data)); +		nfs4_stateid_copy(&data->o_res.stateid, &data->c_res.stateid);  		nfs_confirm_seqid(&data->owner->so_seqid, 0);  		renew_lease(data->o_res.server, data->timestamp);  		data->rpc_done = 1; @@ -1440,7 +1468,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)  		rcu_read_unlock();  	}  	/* Update sequence id. */ -	data->o_arg.id = sp->so_owner_id.id; +	data->o_arg.id = sp->so_seqid.owner_id;  	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;  	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {  		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; @@ -1449,7 +1477,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)  	data->timestamp = jiffies;  	if (nfs4_setup_sequence(data->o_arg.server,  				&data->o_arg.seq_args, -				&data->o_res.seq_res, 1, task)) +				&data->o_res.seq_res, task))  		return;  	rpc_call_start(task);  	return; @@ -1551,6 +1579,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)  	};  	int status; +	nfs41_init_sequence(&o_arg->seq_args, &o_res->seq_res, 1);  	kref_get(&data->kref);  	data->rpc_done = 0;  	data->rpc_status = 0; @@ -1712,15 +1741,32 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta  }  #if defined(CONFIG_NFS_V4_1) -static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +static int nfs41_check_expired_stateid(struct nfs4_state *state, nfs4_stateid *stateid, 
unsigned int flags)  { -	int status; +	int status = NFS_OK;  	struct nfs_server *server = NFS_SERVER(state->inode); -	status = nfs41_test_stateid(server, state); -	if (status == NFS_OK) -		return 0; -	nfs41_free_stateid(server, state); +	if (state->flags & flags) { +		status = nfs41_test_stateid(server, stateid); +		if (status != NFS_OK) { +			nfs41_free_stateid(server, stateid); +			state->flags &= ~flags; +		} +	} +	return status; +} + +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ +	int deleg_status, open_status; +	int deleg_flags = 1 << NFS_DELEGATED_STATE; +	int open_flags = (1 << NFS_O_RDONLY_STATE) | (1 << NFS_O_WRONLY_STATE) | (1 << NFS_O_RDWR_STATE); + +	deleg_status = nfs41_check_expired_stateid(state, &state->stateid, deleg_flags); +	open_status = nfs41_check_expired_stateid(state,  &state->open_stateid, open_flags); + +	if ((deleg_status == NFS_OK) && (open_status == NFS_OK)) +		return NFS_OK;  	return nfs4_open_expired(sp, state);  }  #endif @@ -1754,7 +1800,8 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode  	/* Protect against reboot recovery conflicts */  	status = -ENOMEM; -	if (!(sp = nfs4_get_state_owner(server, cred))) { +	sp = nfs4_get_state_owner(server, cred, GFP_KERNEL); +	if (sp == NULL) {  		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");  		goto out_err;  	} @@ -1829,7 +1876,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry,  		 * the user though...  		 
*/  		if (status == -NFS4ERR_BAD_SEQID) { -			printk(KERN_WARNING "NFS: v4 server %s " +			pr_warn_ratelimited("NFS: v4 server %s "  					" returned a bad sequence-id error!\n",  					NFS_SERVER(dir)->nfs_client->cl_hostname);  			exception.retry = 1; @@ -1882,12 +1929,14 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  	nfs_fattr_init(fattr); -	if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { +	if (state != NULL) { +		nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, +				current->files, current->tgid); +	} else if (nfs4_copy_delegation_stateid(&arg.stateid, inode, +				FMODE_WRITE)) {  		/* Use that stateid */ -	} else if (state != NULL) { -		nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);  	} else -		memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); +		nfs4_stateid_copy(&arg.stateid, &zero_stateid);  	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);  	if (status == 0 && state != NULL) @@ -1900,7 +1949,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,  			   struct nfs4_state *state)  {  	struct nfs_server *server = NFS_SERVER(inode); -	struct nfs4_exception exception = { }; +	struct nfs4_exception exception = { +		.state = state, +		.inode = inode, +	};  	int err;  	do {  		err = nfs4_handle_exception(server, @@ -1954,6 +2006,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)  	struct nfs4_state *state = calldata->state;  	struct nfs_server *server = NFS_SERVER(calldata->inode); +	dprintk("%s: begin!\n", __func__);  	if (!nfs4_sequence_done(task, &calldata->res.seq_res))  		return;          /* hmm. 
we are done with the inode, and in the process of freeing @@ -1981,6 +2034,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)  	}  	nfs_release_seqid(calldata->arg.seqid);  	nfs_refresh_inode(calldata->inode, calldata->res.fattr); +	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);  }  static void nfs4_close_prepare(struct rpc_task *task, void *data) @@ -1989,6 +2043,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)  	struct nfs4_state *state = calldata->state;  	int call_close = 0; +	dprintk("%s: begin!\n", __func__);  	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)  		return; @@ -2013,7 +2068,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)  	if (!call_close) {  		/* Note: exit _without_ calling nfs4_close_done */  		task->tk_action = NULL; -		return; +		goto out;  	}  	if (calldata->arg.fmode == 0) { @@ -2022,17 +2077,20 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)  		    pnfs_roc_drain(calldata->inode, &calldata->roc_barrier)) {  			rpc_sleep_on(&NFS_SERVER(calldata->inode)->roc_rpcwaitq,  				     task, NULL); -			return; +			goto out;  		}  	}  	nfs_fattr_init(calldata->res.fattr);  	calldata->timestamp = jiffies;  	if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), -				&calldata->arg.seq_args, &calldata->res.seq_res, -				1, task)) -		return; +				&calldata->arg.seq_args, +				&calldata->res.seq_res, +				task)) +		goto out;  	rpc_call_start(task); +out: +	dprintk("%s: done!\n", __func__);  }  static const struct rpc_call_ops nfs4_close_ops = { @@ -2074,6 +2132,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc)  	calldata = kzalloc(sizeof(*calldata), gfp_mask);  	if (calldata == NULL)  		goto out; +	nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 1);  	calldata->inode = state->inode;  	calldata->state = state;  	calldata->arg.fh = NFS_FH(state->inode); @@ -2182,6 +2241,7 @@ static int 
_nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f  		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;  		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;  		server->acl_bitmask = res.acl_bitmask; +		server->fh_expire_type = res.fh_expire_type;  	}  	return status; @@ -2303,7 +2363,6 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,  	return nfs4_map_errors(status);  } -static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);  /*   * Get locations and (maybe) other attributes of a referral.   * Note that we'll actually follow the referral later when @@ -2420,6 +2479,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,  		}  	} +	/* Deal with open(O_TRUNC) */ +	if (sattr->ia_valid & ATTR_OPEN) +		sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); +  	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);  	if (status == 0)  		nfs_setattr_update_inode(inode, sattr); @@ -2494,7 +2557,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry  	struct nfs_server *server = NFS_SERVER(inode);  	struct nfs4_accessargs args = {  		.fh = NFS_FH(inode), -		.bitmask = server->attr_bitmask, +		.bitmask = server->cache_consistency_bitmask,  	};  	struct nfs4_accessres res = {  		.server = server, @@ -2712,8 +2775,18 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)  	args->bitmask = server->cache_consistency_bitmask;  	res->server = server; -	res->seq_res.sr_slot = NULL;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; +	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1); +} + +static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ +	if (nfs4_setup_sequence(NFS_SERVER(data->dir), +				&data->args.seq_args, +				&data->res.seq_res, +				task)) +		return; +	rpc_call_start(task);  }  static int 
nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) @@ -2738,6 +2811,17 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME];  	arg->bitmask = server->attr_bitmask;  	res->server = server; +	nfs41_init_sequence(&arg->seq_args, &res->seq_res, 1); +} + +static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ +	if (nfs4_setup_sequence(NFS_SERVER(data->old_dir), +				&data->args.seq_args, +				&data->res.seq_res, +				task)) +		return; +	rpc_call_start(task);  }  static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, @@ -3232,6 +3316,17 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message  	data->timestamp   = jiffies;  	data->read_done_cb = nfs4_read_done_cb;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; +	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); +} + +static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +{ +	if (nfs4_setup_sequence(NFS_SERVER(data->inode), +				&data->args.seq_args, +				&data->res.seq_res, +				task)) +		return; +	rpc_call_start(task);  }  /* Reset the the nfs_read_data to send the read to the MDS. 
*/ @@ -3305,6 +3400,17 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag  	data->timestamp   = jiffies;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; +	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); +} + +static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +{ +	if (nfs4_setup_sequence(NFS_SERVER(data->inode), +				&data->args.seq_args, +				&data->res.seq_res, +				task)) +		return; +	rpc_call_start(task);  }  static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3339,6 +3445,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa  		data->write_done_cb = nfs4_commit_done_cb;  	data->res.server = server;  	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; +	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);  }  struct nfs4_renewdata { @@ -3575,8 +3682,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu  	}  	if (npages > 1) {  		/* for decoding across pages */ -		args.acl_scratch = alloc_page(GFP_KERNEL); -		if (!args.acl_scratch) +		res.acl_scratch = alloc_page(GFP_KERNEL); +		if (!res.acl_scratch)  			goto out_free;  	}  	args.acl_len = npages * PAGE_SIZE; @@ -3612,8 +3719,8 @@ out_free:  	for (i = 0; i < npages; i++)  		if (pages[i])  			__free_page(pages[i]); -	if (args.acl_scratch) -		__free_page(args.acl_scratch); +	if (res.acl_scratch) +		__free_page(res.acl_scratch);  	return ret;  } @@ -3714,8 +3821,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,  	if (task->tk_status >= 0)  		return 0;  	switch(task->tk_status) { +		case -NFS4ERR_DELEG_REVOKED:  		case -NFS4ERR_ADMIN_REVOKED:  		case -NFS4ERR_BAD_STATEID: +			if (state != NULL) +				nfs_remove_bad_delegation(state->inode);  		case -NFS4ERR_OPENMODE:  			if (state == NULL)  				break; @@ -3764,6 +3874,16 @@ wait_on_recovery:  	return -EAGAIN;  } 
+static void nfs4_construct_boot_verifier(struct nfs_client *clp, +					 nfs4_verifier *bootverf) +{ +	__be32 verf[2]; + +	verf[0] = htonl((u32)clp->cl_boot_time.tv_sec); +	verf[1] = htonl((u32)clp->cl_boot_time.tv_nsec); +	memcpy(bootverf->data, verf, sizeof(bootverf->data)); +} +  int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,  		unsigned short port, struct rpc_cred *cred,  		struct nfs4_setclientid_res *res) @@ -3780,15 +3900,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,  		.rpc_resp = res,  		.rpc_cred = cred,  	}; -	__be32 *p;  	int loop = 0;  	int status; -	p = (__be32*)sc_verifier.data; -	*p++ = htonl((u32)clp->cl_boot_time.tv_sec); -	*p = htonl((u32)clp->cl_boot_time.tv_nsec); +	nfs4_construct_boot_verifier(clp, &sc_verifier);  	for(;;) { +		rcu_read_lock();  		setclientid.sc_name_len = scnprintf(setclientid.sc_name,  				sizeof(setclientid.sc_name), "%s/%s %s %s %u",  				clp->cl_ipaddr, @@ -3805,6 +3923,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,  		setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,  				sizeof(setclientid.sc_uaddr), "%s.%u.%u",  				clp->cl_ipaddr, port >> 8, port & 255); +		rcu_read_unlock();  		status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);  		if (status != -NFS4ERR_CLID_INUSE) @@ -3891,7 +4010,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)  	if (nfs4_setup_sequence(d_data->res.server,  				&d_data->args.seq_args, -				&d_data->res.seq_res, 1, task)) +				&d_data->res.seq_res, task))  		return;  	rpc_call_start(task);  } @@ -3925,11 +4044,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co  	data = kzalloc(sizeof(*data), GFP_NOFS);  	if (data == NULL)  		return -ENOMEM; +	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);  	data->args.fhandle = &data->fh;  	data->args.stateid = &data->stateid;  	data->args.bitmask = server->attr_bitmask;  	nfs_copy_fh(&data->fh, 
NFS_FH(inode)); -	memcpy(&data->stateid, stateid, sizeof(data->stateid)); +	nfs4_stateid_copy(&data->stateid, stateid);  	data->res.fattr = &data->fattr;  	data->res.server = server;  	nfs_fattr_init(data->res.fattr); @@ -4016,7 +4136,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock  	if (status != 0)  		goto out;  	lsp = request->fl_u.nfs4_fl.owner; -	arg.lock_owner.id = lsp->ls_id.id; +	arg.lock_owner.id = lsp->ls_seqid.owner_id;  	arg.lock_owner.s_dev = server->s_dev;  	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);  	switch (status) { @@ -4112,9 +4232,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)  		return;  	switch (task->tk_status) {  		case 0: -			memcpy(calldata->lsp->ls_stateid.data, -					calldata->res.stateid.data, -					sizeof(calldata->lsp->ls_stateid.data)); +			nfs4_stateid_copy(&calldata->lsp->ls_stateid, +					&calldata->res.stateid);  			renew_lease(calldata->server, calldata->timestamp);  			break;  		case -NFS4ERR_BAD_STATEID: @@ -4142,7 +4261,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)  	calldata->timestamp = jiffies;  	if (nfs4_setup_sequence(calldata->server,  				&calldata->arg.seq_args, -				&calldata->res.seq_res, 1, task)) +				&calldata->res.seq_res, task))  		return;  	rpc_call_start(task);  } @@ -4182,6 +4301,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,  		return ERR_PTR(-ENOMEM);  	} +	nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);  	msg.rpc_argp = &data->arg;  	msg.rpc_resp = &data->res;  	task_setup_data.callback_data = data; @@ -4261,7 +4381,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,  		goto out_free_seqid;  	p->arg.lock_stateid = &lsp->ls_stateid;  	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; -	p->arg.lock_owner.id = lsp->ls_id.id; +	p->arg.lock_owner.id = lsp->ls_seqid.owner_id;  	p->arg.lock_owner.s_dev = 
server->s_dev;  	p->res.lock_seqid = p->arg.lock_seqid;  	p->lsp = lsp; @@ -4297,7 +4417,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)  	data->timestamp = jiffies;  	if (nfs4_setup_sequence(data->server,  				&data->arg.seq_args, -				&data->res.seq_res, 1, task)) +				&data->res.seq_res, task))  		return;  	rpc_call_start(task);  	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); @@ -4326,8 +4446,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)  			goto out;  	}  	if (data->rpc_status == 0) { -		memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, -					sizeof(data->lsp->ls_stateid.data)); +		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);  		data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;  		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);  	} @@ -4415,6 +4534,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f  			data->arg.reclaim = NFS_LOCK_RECLAIM;  		task_setup_data.callback_ops = &nfs4_recover_lock_ops;  	} +	nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);  	msg.rpc_argp = &data->arg;  	msg.rpc_resp = &data->res;  	task_setup_data.callback_data = data; @@ -4479,15 +4599,34 @@ out:  }  #if defined(CONFIG_NFS_V4_1) -static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +static int nfs41_check_expired_locks(struct nfs4_state *state)  { -	int status; +	int status, ret = NFS_OK; +	struct nfs4_lock_state *lsp;  	struct nfs_server *server = NFS_SERVER(state->inode); -	status = nfs41_test_stateid(server, state); +	list_for_each_entry(lsp, &state->lock_states, ls_locks) { +		if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { +			status = nfs41_test_stateid(server, &lsp->ls_stateid); +			if (status != NFS_OK) { +				nfs41_free_stateid(server, &lsp->ls_stateid); +				lsp->ls_flags &= ~NFS_LOCK_INITIALIZED; +				ret = status; +			} +		} +	}; + +	return ret; +} + +static int 
nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ +	int status = NFS_OK; + +	if (test_bit(LK_STATE_IN_USE, &state->flags)) +		status = nfs41_check_expired_locks(state);  	if (status == NFS_OK) -		return 0; -	nfs41_free_stateid(server, state); +		return status;  	return nfs4_lock_expired(state, request);  }  #endif @@ -4523,7 +4662,8 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock  	/* Note: we always want to sleep here! */  	request->fl_flags = fl_flags | FL_SLEEP;  	if (do_vfs_lock(request->fl_file, request) < 0) -		printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); +		printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock " +			"manager!\n", __func__);  out_unlock:  	up_read(&nfsi->rwsem);  out: @@ -4533,7 +4673,9 @@ out:  static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)  { -	struct nfs4_exception exception = { }; +	struct nfs4_exception exception = { +		.state = state, +	};  	int err;  	do { @@ -4603,8 +4745,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)  		err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);  		switch (err) {  			default: -				printk(KERN_ERR "%s: unhandled error %d.\n", -						__func__, err); +				printk(KERN_ERR "NFS: %s: unhandled error " +					"%d.\n", __func__, err);  			case 0:  			case -ESTALE:  				goto out; @@ -4626,6 +4768,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)  				 * The show must go on: exit, but mark the  				 * stateid as needing recovery.  				 
*/ +			case -NFS4ERR_DELEG_REVOKED:  			case -NFS4ERR_ADMIN_REVOKED:  			case -NFS4ERR_BAD_STATEID:  			case -NFS4ERR_OPENMODE: @@ -4655,33 +4798,44 @@ out:  	return err;  } +struct nfs_release_lockowner_data { +	struct nfs4_lock_state *lsp; +	struct nfs_server *server; +	struct nfs_release_lockowner_args args; +}; +  static void nfs4_release_lockowner_release(void *calldata)  { +	struct nfs_release_lockowner_data *data = calldata; +	nfs4_free_lock_state(data->server, data->lsp);  	kfree(calldata);  } -const struct rpc_call_ops nfs4_release_lockowner_ops = { +static const struct rpc_call_ops nfs4_release_lockowner_ops = {  	.rpc_release = nfs4_release_lockowner_release,  }; -void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +int nfs4_release_lockowner(struct nfs4_lock_state *lsp)  {  	struct nfs_server *server = lsp->ls_state->owner->so_server; -	struct nfs_release_lockowner_args *args; +	struct nfs_release_lockowner_data *data;  	struct rpc_message msg = {  		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],  	};  	if (server->nfs_client->cl_mvops->minor_version != 0) -		return; -	args = kmalloc(sizeof(*args), GFP_NOFS); -	if (!args) -		return; -	args->lock_owner.clientid = server->nfs_client->cl_clientid; -	args->lock_owner.id = lsp->ls_id.id; -	args->lock_owner.s_dev = server->s_dev; -	msg.rpc_argp = args; -	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +		return -EINVAL; +	data = kmalloc(sizeof(*data), GFP_NOFS); +	if (!data) +		return -ENOMEM; +	data->lsp = lsp; +	data->server = server; +	data->args.lock_owner.clientid = server->nfs_client->cl_clientid; +	data->args.lock_owner.id = lsp->ls_seqid.owner_id; +	data->args.lock_owner.s_dev = server->s_dev; +	msg.rpc_argp = &data->args; +	rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); +	return 0;  }  #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" @@ -4727,11 +4881,11 @@ static void nfs_fixup_referral_attributes(struct nfs_fattr 
*fattr)  	if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) ||  	       (fattr->valid & NFS_ATTR_FATTR_FILEID)) &&  	      (fattr->valid & NFS_ATTR_FATTR_FSID) && -	      (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) +	      (fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)))  		return;  	fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | -		NFS_ATTR_FATTR_NLINK; +		NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_V4_REFERRAL;  	fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;  	fattr->nlink = 2;  } @@ -4798,7 +4952,8 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct  	return status;  } -int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +static int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, +		struct nfs4_secinfo_flavors *flavors)  {  	struct nfs4_exception exception = { };  	int err; @@ -4852,6 +5007,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)  {  	nfs4_verifier verifier;  	struct nfs41_exchange_id_args args = { +		.verifier = &verifier,  		.client = clp,  		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,  	}; @@ -4865,15 +5021,11 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)  		.rpc_resp = &res,  		.rpc_cred = cred,  	}; -	__be32 *p;  	dprintk("--> %s\n", __func__);  	BUG_ON(clp == NULL); -	p = (u32 *)verifier.data; -	*p++ = htonl((u32)clp->cl_boot_time.tv_sec); -	*p = htonl((u32)clp->cl_boot_time.tv_nsec); -	args.verifier = &verifier; +	nfs4_construct_boot_verifier(clp, &verifier);  	args.id_len = scnprintf(args.id, sizeof(args.id),  				"%s/%s.%s/%u", @@ -4883,14 +5035,29 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)  				clp->cl_rpcclient->cl_auth->au_flavor);  	res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); -	if (unlikely(!res.server_scope)) -		return -ENOMEM; +	if (unlikely(!res.server_scope)) { +		status = -ENOMEM; +		goto out; +	} + +	res.impl_id = 
kzalloc(sizeof(struct nfs41_impl_id), GFP_KERNEL); +	if (unlikely(!res.impl_id)) { +		status = -ENOMEM; +		goto out_server_scope; +	}  	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);  	if (!status)  		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);  	if (!status) { +		/* use the most recent implementation id */ +		kfree(clp->impl_id); +		clp->impl_id = res.impl_id; +	} else +		kfree(res.impl_id); + +	if (!status) {  		if (clp->server_scope &&  		    !nfs41_same_server_scope(clp->server_scope,  					     res.server_scope)) { @@ -4901,12 +5068,21 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)  			clp->server_scope = NULL;  		} -		if (!clp->server_scope) +		if (!clp->server_scope) {  			clp->server_scope = res.server_scope; -		else -			kfree(res.server_scope); +			goto out; +		}  	} +out_server_scope: +	kfree(res.server_scope); +out: +	if (clp->impl_id) +		dprintk("%s: Server Implementation ID: " +			"domain: %s, name: %s, date: %llu,%u\n", +			__func__, clp->impl_id->domain, clp->impl_id->name, +			clp->impl_id->date.seconds, +			clp->impl_id->date.nseconds);  	dprintk("<-- %s status= %d\n", __func__, status);  	return status;  } @@ -4930,7 +5106,7 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,  	   since we're invoked within one */  	ret = nfs41_setup_sequence(data->clp->cl_session,  				   &data->args->la_seq_args, -				   &data->res->lr_seq_res, 0, task); +				   &data->res->lr_seq_res, task);  	BUG_ON(ret == -EAGAIN);  	rpc_call_start(task); @@ -4963,7 +5139,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)  	dprintk("<-- %s\n", __func__);  } -struct rpc_call_ops nfs4_get_lease_time_ops = { +static const struct rpc_call_ops nfs4_get_lease_time_ops = {  	.rpc_call_prepare = nfs4_get_lease_time_prepare,  	.rpc_call_done = nfs4_get_lease_time_done,  }; @@ -4994,6 +5170,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)  	
};  	int status; +	nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);  	dprintk("--> %s\n", __func__);  	task = rpc_run_task(&task_setup); @@ -5008,37 +5185,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)  	return status;  } +static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) +{ +	return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags); +} + +static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, +		struct nfs4_slot *new, +		u32 max_slots, +		u32 ivalue) +{ +	struct nfs4_slot *old = NULL; +	u32 i; + +	spin_lock(&tbl->slot_tbl_lock); +	if (new) { +		old = tbl->slots; +		tbl->slots = new; +		tbl->max_slots = max_slots; +	} +	tbl->highest_used_slotid = -1;	/* no slot is currently used */ +	for (i = 0; i < tbl->max_slots; i++) +		tbl->slots[i].seq_nr = ivalue; +	spin_unlock(&tbl->slot_tbl_lock); +	kfree(old); +} +  /* - * Reset a slot table + * (re)Initialise a slot table   */ -static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, -				 int ivalue) +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, +				 u32 ivalue)  {  	struct nfs4_slot *new = NULL; -	int i; -	int ret = 0; +	int ret = -ENOMEM;  	dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,  		max_reqs, tbl->max_slots);  	/* Does the newly negotiated max_reqs match the existing slot table? 
*/  	if (max_reqs != tbl->max_slots) { -		ret = -ENOMEM; -		new = kmalloc(max_reqs * sizeof(struct nfs4_slot), -			      GFP_NOFS); +		new = nfs4_alloc_slots(max_reqs, GFP_NOFS);  		if (!new)  			goto out; -		ret = 0; -		kfree(tbl->slots); -	} -	spin_lock(&tbl->slot_tbl_lock); -	if (new) { -		tbl->slots = new; -		tbl->max_slots = max_reqs;  	} -	for (i = 0; i < tbl->max_slots; ++i) -		tbl->slots[i].seq_nr = ivalue; -	spin_unlock(&tbl->slot_tbl_lock); +	ret = 0; + +	nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);  	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,  		tbl, tbl->slots, tbl->max_slots);  out: @@ -5061,36 +5254,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)  }  /* - * Initialize slot table - */ -static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, -		int max_slots, int ivalue) -{ -	struct nfs4_slot *slot; -	int ret = -ENOMEM; - -	BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); - -	dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); - -	slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); -	if (!slot) -		goto out; -	ret = 0; - -	spin_lock(&tbl->slot_tbl_lock); -	tbl->max_slots = max_slots; -	tbl->slots = slot; -	tbl->highest_used_slotid = -1;  /* no slot is currently used */ -	spin_unlock(&tbl->slot_tbl_lock); -	dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, -		tbl, tbl->slots, tbl->max_slots); -out: -	dprintk("<-- %s: return %d\n", __func__, ret); -	return ret; -} - -/*   * Initialize or reset the forechannel and backchannel tables   */  static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) @@ -5101,25 +5264,16 @@ static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)  	dprintk("--> %s\n", __func__);  	/* Fore channel */  	tbl = &ses->fc_slot_table; -	if (tbl->slots == NULL) { -		status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1); -		if (status) /* -ENOMEM */ -			return status; -	} else { -		status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1); -		
if (status) -			return status; -	} +	status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); +	if (status) /* -ENOMEM */ +		return status;  	/* Back channel */  	tbl = &ses->bc_slot_table; -	if (tbl->slots == NULL) { -		status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0); -		if (status) -			/* Fore and back channel share a connection so get -			 * both slot tables or neither */ -			nfs4_destroy_slot_tables(ses); -	} else -		status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0); +	status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); +	if (status && tbl->slots == NULL) +		/* Fore and back channel share a connection so get +		 * both slot tables or neither */ +		nfs4_destroy_slot_tables(ses);  	return status;  } @@ -5133,13 +5287,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)  		return NULL;  	tbl = &session->fc_slot_table; -	tbl->highest_used_slotid = -1; +	tbl->highest_used_slotid = NFS4_NO_SLOT;  	spin_lock_init(&tbl->slot_tbl_lock);  	rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");  	init_completion(&tbl->complete);  	tbl = &session->bc_slot_table; -	tbl->highest_used_slotid = -1; +	tbl->highest_used_slotid = NFS4_NO_SLOT;  	spin_lock_init(&tbl->slot_tbl_lock);  	rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");  	init_completion(&tbl->complete); @@ -5152,11 +5306,16 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)  void nfs4_destroy_session(struct nfs4_session *session)  { +	struct rpc_xprt *xprt; +  	nfs4_proc_destroy_session(session); + +	rcu_read_lock(); +	xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt); +	rcu_read_unlock();  	dprintk("%s Destroy backchannel for xprt %p\n", -		__func__, session->clp->cl_rpcclient->cl_xprt); -	xprt_destroy_backchannel(session->clp->cl_rpcclient->cl_xprt, -				NFS41_BC_MIN_CALLBACKS); +		__func__, xprt); +	xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);  	
nfs4_destroy_slot_tables(session);  	kfree(session);  } @@ -5184,7 +5343,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)  	args->fc_attrs.max_rqst_sz = mxrqst_sz;  	args->fc_attrs.max_resp_sz = mxresp_sz;  	args->fc_attrs.max_ops = NFS4_MAX_OPS; -	args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; +	args->fc_attrs.max_reqs = max_session_slots;  	dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "  		"max_ops=%u max_reqs=%u\n", @@ -5224,6 +5383,8 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args  		return -EINVAL;  	if (rcvd->max_reqs == 0)  		return -EINVAL; +	if (rcvd->max_reqs > NFS4_MAX_SLOT_TABLE) +		rcvd->max_reqs = NFS4_MAX_SLOT_TABLE;  	return 0;  } @@ -5239,9 +5400,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args  	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)  		return -EINVAL;  	/* These would render the backchannel useless: */ -	if (rcvd->max_ops  == 0) +	if (rcvd->max_ops != sent->max_ops)  		return -EINVAL; -	if (rcvd->max_reqs == 0) +	if (rcvd->max_reqs != sent->max_reqs)  		return -EINVAL;  	return 0;  } @@ -5344,7 +5505,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)  	if (status)  		printk(KERN_WARNING -			"Got error %d from the server on DESTROY_SESSION. " +			"NFS: Got error %d from the server on DESTROY_SESSION. 
"  			"Session has been destroyed regardless...\n", status);  	dprintk("<-- nfs4_proc_destroy_session\n"); @@ -5467,7 +5628,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)  	args = task->tk_msg.rpc_argp;  	res = task->tk_msg.rpc_resp; -	if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) +	if (nfs41_setup_sequence(clp->cl_session, args, res, task))  		return;  	rpc_call_start(task);  } @@ -5499,6 +5660,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_  		nfs_put_client(clp);  		return ERR_PTR(-ENOMEM);  	} +	nfs41_init_sequence(&calldata->args, &calldata->res, 0);  	msg.rpc_argp = &calldata->args;  	msg.rpc_resp = &calldata->res;  	calldata->clp = clp; @@ -5560,7 +5722,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)  	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);  	if (nfs41_setup_sequence(calldata->clp->cl_session,  				&calldata->arg.seq_args, -				&calldata->res.seq_res, 0, task)) +				&calldata->res.seq_res, task))  		return;  	rpc_call_start(task); @@ -5639,6 +5801,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)  	calldata->clp = clp;  	calldata->arg.one_fs = 0; +	nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);  	msg.rpc_argp = &calldata->arg;  	msg.rpc_resp = &calldata->res;  	task_setup_data.callback_data = calldata; @@ -5670,7 +5833,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)  	 * to be no way to prevent it completely.  	 
*/  	if (nfs4_setup_sequence(server, &lgp->args.seq_args, -				&lgp->res.seq_res, 0, task)) +				&lgp->res.seq_res, task))  		return;  	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,  					  NFS_I(lgp->args.inode)->layout, @@ -5745,6 +5908,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)  	lgp->res.layoutp = &lgp->args.layout;  	lgp->res.seq_res.sr_slot = NULL; +	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task))  		return PTR_ERR(task); @@ -5765,7 +5929,7 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)  	dprintk("--> %s\n", __func__);  	if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, -				&lrp->res.seq_res, 0, task)) +				&lrp->res.seq_res, task))  		return;  	rpc_call_start(task);  } @@ -5831,6 +5995,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)  	int status;  	dprintk("--> %s\n", __func__); +	nfs41_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task))  		return PTR_ERR(task); @@ -5931,7 +6096,7 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)  	struct nfs_server *server = NFS_SERVER(data->args.inode);  	if (nfs4_setup_sequence(server, &data->args.seq_args, -				&data->res.seq_res, 1, task)) +				&data->res.seq_res, task))  		return;  	rpc_call_start(task);  } @@ -6018,6 +6183,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)  		data->args.lastbytewritten,  		data->args.inode->i_ino); +	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);  	task = rpc_run_task(&task_setup_data);  	if (IS_ERR(task))  		return PTR_ERR(task); @@ -6111,11 +6277,12 @@ out_freepage:  out:  	return err;  } -static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) + +static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)  {  	int status;  	struct 
nfs41_test_stateid_args args = { -		.stateid = &state->stateid, +		.stateid = stateid,  	};  	struct nfs41_test_stateid_res res;  	struct rpc_message msg = { @@ -6123,28 +6290,31 @@ static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *sta  		.rpc_argp = &args,  		.rpc_resp = &res,  	}; -	args.seq_args.sa_session = res.seq_res.sr_session = NULL; -	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + +	nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); +	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); + +	if (status == NFS_OK) +		return res.status;  	return status;  } -static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)  {  	struct nfs4_exception exception = { };  	int err;  	do {  		err = nfs4_handle_exception(server, -				_nfs41_test_stateid(server, state), +				_nfs41_test_stateid(server, stateid),  				&exception);  	} while (exception.retry);  	return err;  } -static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state) +static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)  { -	int status;  	struct nfs41_free_stateid_args args = { -		.stateid = &state->stateid, +		.stateid = stateid,  	};  	struct nfs41_free_stateid_res res;  	struct rpc_message msg = { @@ -6153,25 +6323,46 @@ static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *stat  		.rpc_resp = &res,  	}; -	args.seq_args.sa_session = res.seq_res.sr_session = NULL; -	status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); -	return status; +	nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); +	return nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);  } -static int nfs41_free_stateid(struct nfs_server *server, struct 
nfs4_state *state) +static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)  {  	struct nfs4_exception exception = { };  	int err;  	do {  		err = nfs4_handle_exception(server, -				_nfs4_free_stateid(server, state), +				_nfs4_free_stateid(server, stateid),  				&exception);  	} while (exception.retry);  	return err;  } + +static bool nfs41_match_stateid(const nfs4_stateid *s1, +		const nfs4_stateid *s2) +{ +	if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) +		return false; + +	if (s1->seqid == s2->seqid) +		return true; +	if (s1->seqid == 0 || s2->seqid == 0) +		return true; + +	return false; +} +  #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { +static bool nfs4_match_stateid(const nfs4_stateid *s1, +		const nfs4_stateid *s2) +{ +	return nfs4_stateid_match(s1, s2); +} + + +static const struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,  	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,  	.recover_open	= nfs4_open_reclaim, @@ -6181,7 +6372,7 @@ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {  };  #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,  	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,  	.recover_open	= nfs4_open_reclaim, @@ -6192,7 +6383,7 @@ struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {  };  #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,  	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,  	.recover_open	= nfs4_open_expired, @@ -6202,7 +6393,7 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {  };  #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_recovery_ops 
nfs41_nograce_recovery_ops = { +static const struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {  	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,  	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,  	.recover_open	= nfs41_open_expired, @@ -6212,14 +6403,14 @@ struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = {  };  #endif /* CONFIG_NFS_V4_1 */ -struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = { +static const struct nfs4_state_maintenance_ops nfs40_state_renewal_ops = {  	.sched_state_renewal = nfs4_proc_async_renew,  	.get_state_renewal_cred_locked = nfs4_get_renew_cred_locked,  	.renew_lease = nfs4_proc_renew,  };  #if defined(CONFIG_NFS_V4_1) -struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { +static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {  	.sched_state_renewal = nfs41_proc_async_sequence,  	.get_state_renewal_cred_locked = nfs4_get_machine_cred_locked,  	.renew_lease = nfs4_proc_sequence, @@ -6229,7 +6420,7 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {  static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {  	.minor_version = 0,  	.call_sync = _nfs4_call_sync, -	.validate_stateid = nfs4_validate_delegation_stateid, +	.match_stateid = nfs4_match_stateid,  	.find_root_sec = nfs4_find_root_sec,  	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs40_nograce_recovery_ops, @@ -6240,7 +6431,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {  static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {  	.minor_version = 1,  	.call_sync = _nfs4_call_sync_session, -	.validate_stateid = nfs41_validate_delegation_stateid, +	.match_stateid = nfs41_match_stateid,  	.find_root_sec = nfs41_find_root_sec,  	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,  	.nograce_recovery_ops = &nfs41_nograce_recovery_ops, @@ -6280,9 +6471,11 @@ const struct nfs_rpc_ops nfs_v4_clientops = {  	.create		= nfs4_proc_create,  	.remove		= 
nfs4_proc_remove,  	.unlink_setup	= nfs4_proc_unlink_setup, +	.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,  	.unlink_done	= nfs4_proc_unlink_done,  	.rename		= nfs4_proc_rename,  	.rename_setup	= nfs4_proc_rename_setup, +	.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,  	.rename_done	= nfs4_proc_rename_done,  	.link		= nfs4_proc_link,  	.symlink	= nfs4_proc_symlink, @@ -6296,8 +6489,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {  	.set_capabilities = nfs4_server_capabilities,  	.decode_dirent	= nfs4_decode_dirent,  	.read_setup	= nfs4_proc_read_setup, +	.read_rpc_prepare = nfs4_proc_read_rpc_prepare,  	.read_done	= nfs4_read_done,  	.write_setup	= nfs4_proc_write_setup, +	.write_rpc_prepare = nfs4_proc_write_rpc_prepare,  	.write_done	= nfs4_write_done,  	.commit_setup	= nfs4_proc_commit_setup,  	.commit_done	= nfs4_commit_done, @@ -6321,6 +6516,10 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {  	NULL  }; +module_param(max_session_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " +		"requests the client will negotiate"); +  /*   * Local variables:   *  c-basic-offset: 8 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a53f33b4ac3..0f43414eb25 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -146,6 +146,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)  	struct rpc_cred *cred = NULL;  	struct nfs_server *server; +	/* Use machine credentials if available */ +	cred = nfs4_get_machine_cred_locked(clp); +	if (cred != NULL) +		goto out; +  	rcu_read_lock();  	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {  		cred = nfs4_get_renew_cred_server_locked(server); @@ -153,6 +158,8 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)  			break;  	}  	rcu_read_unlock(); + +out:  	return cred;  } @@ -190,30 +197,29 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)  static void nfs4_end_drain_session(struct 
nfs_client *clp)  {  	struct nfs4_session *ses = clp->cl_session; +	struct nfs4_slot_table *tbl;  	int max_slots;  	if (ses == NULL)  		return; +	tbl = &ses->fc_slot_table;  	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { -		spin_lock(&ses->fc_slot_table.slot_tbl_lock); -		max_slots = ses->fc_slot_table.max_slots; +		spin_lock(&tbl->slot_tbl_lock); +		max_slots = tbl->max_slots;  		while (max_slots--) { -			struct rpc_task *task; - -			task = rpc_wake_up_next(&ses->fc_slot_table. -						slot_tbl_waitq); -			if (!task) +			if (rpc_wake_up_first(&tbl->slot_tbl_waitq, +						nfs4_set_task_privileged, +						NULL) == NULL)  				break; -			rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);  		} -		spin_unlock(&ses->fc_slot_table.slot_tbl_lock); +		spin_unlock(&tbl->slot_tbl_lock);  	}  }  static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)  {  	spin_lock(&tbl->slot_tbl_lock); -	if (tbl->highest_used_slotid != -1) { +	if (tbl->highest_used_slotid != NFS4_NO_SLOT) {  		INIT_COMPLETION(tbl->complete);  		spin_unlock(&tbl->slot_tbl_lock);  		return wait_for_completion_interruptible(&tbl->complete); @@ -317,62 +323,6 @@ out:  	return cred;  } -static void nfs_alloc_unique_id_locked(struct rb_root *root, -				       struct nfs_unique_id *new, -				       __u64 minval, int maxbits) -{ -	struct rb_node **p, *parent; -	struct nfs_unique_id *pos; -	__u64 mask = ~0ULL; - -	if (maxbits < 64) -		mask = (1ULL << maxbits) - 1ULL; - -	/* Ensure distribution is more or less flat */ -	get_random_bytes(&new->id, sizeof(new->id)); -	new->id &= mask; -	if (new->id < minval) -		new->id += minval; -retry: -	p = &root->rb_node; -	parent = NULL; - -	while (*p != NULL) { -		parent = *p; -		pos = rb_entry(parent, struct nfs_unique_id, rb_node); - -		if (new->id < pos->id) -			p = &(*p)->rb_left; -		else if (new->id > pos->id) -			p = &(*p)->rb_right; -		else -			goto id_exists; -	} -	rb_link_node(&new->rb_node, parent, p); -	rb_insert_color(&new->rb_node, 
root); -	return; -id_exists: -	for (;;) { -		new->id++; -		if (new->id < minval || (new->id & mask) != new->id) { -			new->id = minval; -			break; -		} -		parent = rb_next(parent); -		if (parent == NULL) -			break; -		pos = rb_entry(parent, struct nfs_unique_id, rb_node); -		if (new->id < pos->id) -			break; -	} -	goto retry; -} - -static void nfs_free_unique_id(struct rb_root *root, struct nfs_unique_id *id) -{ -	rb_erase(&id->rb_node, root); -} -  static struct nfs4_state_owner *  nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)  { @@ -405,6 +355,7 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)  	struct rb_node **p = &server->state_owners.rb_node,  		       *parent = NULL;  	struct nfs4_state_owner *sp; +	int err;  	while (*p != NULL) {  		parent = *p; @@ -421,8 +372,9 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)  			return sp;  		}  	} -	nfs_alloc_unique_id_locked(&server->openowner_id, -					&new->so_owner_id, 1, 64); +	err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); +	if (err) +		return ERR_PTR(err);  	rb_link_node(&new->so_server_node, parent, p);  	rb_insert_color(&new->so_server_node, &server->state_owners);  	return new; @@ -435,7 +387,23 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)  	if (!RB_EMPTY_NODE(&sp->so_server_node))  		rb_erase(&sp->so_server_node, &server->state_owners); -	nfs_free_unique_id(&server->openowner_id, &sp->so_owner_id); +	ida_remove(&server->openowner_id, sp->so_seqid.owner_id); +} + +static void +nfs4_init_seqid_counter(struct nfs_seqid_counter *sc) +{ +	sc->flags = 0; +	sc->counter = 0; +	spin_lock_init(&sc->lock); +	INIT_LIST_HEAD(&sc->list); +	rpc_init_wait_queue(&sc->wait, "Seqid_waitqueue"); +} + +static void +nfs4_destroy_seqid_counter(struct nfs_seqid_counter *sc) +{ +	rpc_destroy_wait_queue(&sc->wait);  }  /* @@ -444,19 +412,20 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)   *   */  static struct 
nfs4_state_owner * -nfs4_alloc_state_owner(void) +nfs4_alloc_state_owner(struct nfs_server *server, +		struct rpc_cred *cred, +		gfp_t gfp_flags)  {  	struct nfs4_state_owner *sp; -	sp = kzalloc(sizeof(*sp),GFP_NOFS); +	sp = kzalloc(sizeof(*sp), gfp_flags);  	if (!sp)  		return NULL; +	sp->so_server = server; +	sp->so_cred = get_rpccred(cred);  	spin_lock_init(&sp->so_lock);  	INIT_LIST_HEAD(&sp->so_states); -	rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); -	sp->so_seqid.sequence = &sp->so_sequence; -	spin_lock_init(&sp->so_sequence.lock); -	INIT_LIST_HEAD(&sp->so_sequence.list); +	nfs4_init_seqid_counter(&sp->so_seqid);  	atomic_set(&sp->so_count, 1);  	INIT_LIST_HEAD(&sp->so_lru);  	return sp; @@ -478,7 +447,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)  static void nfs4_free_state_owner(struct nfs4_state_owner *sp)  { -	rpc_destroy_wait_queue(&sp->so_sequence.wait); +	nfs4_destroy_seqid_counter(&sp->so_seqid);  	put_rpccred(sp->so_cred);  	kfree(sp);  } @@ -516,7 +485,8 @@ static void nfs4_gc_state_owners(struct nfs_server *server)   * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL.   
*/  struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, -					      struct rpc_cred *cred) +					      struct rpc_cred *cred, +					      gfp_t gfp_flags)  {  	struct nfs_client *clp = server->nfs_client;  	struct nfs4_state_owner *sp, *new; @@ -526,20 +496,18 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,  	spin_unlock(&clp->cl_lock);  	if (sp != NULL)  		goto out; -	new = nfs4_alloc_state_owner(); +	new = nfs4_alloc_state_owner(server, cred, gfp_flags);  	if (new == NULL)  		goto out; -	new->so_server = server; -	new->so_cred = cred; -	spin_lock(&clp->cl_lock); -	sp = nfs4_insert_state_owner_locked(new); -	spin_unlock(&clp->cl_lock); -	if (sp == new) -		get_rpccred(cred); -	else { -		rpc_destroy_wait_queue(&new->so_sequence.wait); -		kfree(new); -	} +	do { +		if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) +			break; +		spin_lock(&clp->cl_lock); +		sp = nfs4_insert_state_owner_locked(new); +		spin_unlock(&clp->cl_lock); +	} while (sp == ERR_PTR(-EAGAIN)); +	if (sp != new) +		nfs4_free_state_owner(new);  out:  	nfs4_gc_state_owners(server);  	return sp; @@ -795,15 +763,11 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f  {  	struct nfs4_lock_state *lsp;  	struct nfs_server *server = state->owner->so_server; -	struct nfs_client *clp = server->nfs_client;  	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);  	if (lsp == NULL)  		return NULL; -	rpc_init_wait_queue(&lsp->ls_sequence.wait, "lock_seqid_waitqueue"); -	spin_lock_init(&lsp->ls_sequence.lock); -	INIT_LIST_HEAD(&lsp->ls_sequence.list); -	lsp->ls_seqid.sequence = &lsp->ls_sequence; +	nfs4_init_seqid_counter(&lsp->ls_seqid);  	atomic_set(&lsp->ls_count, 1);  	lsp->ls_state = state;  	lsp->ls_owner.lo_type = type; @@ -815,25 +779,22 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f  		lsp->ls_owner.lo_u.posix_owner = fl_owner;  		break;  	default: -		kfree(lsp); -		return NULL; +		goto out_free;  	
} -	spin_lock(&clp->cl_lock); -	nfs_alloc_unique_id_locked(&server->lockowner_id, &lsp->ls_id, 1, 64); -	spin_unlock(&clp->cl_lock); +	lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); +	if (lsp->ls_seqid.owner_id < 0) +		goto out_free;  	INIT_LIST_HEAD(&lsp->ls_locks);  	return lsp; +out_free: +	kfree(lsp); +	return NULL;  } -static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) +void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)  { -	struct nfs_server *server = lsp->ls_state->owner->so_server; -	struct nfs_client *clp = server->nfs_client; - -	spin_lock(&clp->cl_lock); -	nfs_free_unique_id(&server->lockowner_id, &lsp->ls_id); -	spin_unlock(&clp->cl_lock); -	rpc_destroy_wait_queue(&lsp->ls_sequence.wait); +	ida_simple_remove(&server->lockowner_id, lsp->ls_seqid.owner_id); +	nfs4_destroy_seqid_counter(&lsp->ls_seqid);  	kfree(lsp);  } @@ -865,7 +826,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_  	}  	spin_unlock(&state->state_lock);  	if (new != NULL) -		nfs4_free_lock_state(new); +		nfs4_free_lock_state(state->owner->so_server, new);  	return lsp;  } @@ -886,9 +847,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)  	if (list_empty(&state->lock_states))  		clear_bit(LK_STATE_IN_USE, &state->flags);  	spin_unlock(&state->state_lock); -	if (lsp->ls_flags & NFS_LOCK_INITIALIZED) -		nfs4_release_lockowner(lsp); -	nfs4_free_lock_state(lsp); +	if (lsp->ls_flags & NFS_LOCK_INITIALIZED) { +		if (nfs4_release_lockowner(lsp) == 0) +			return; +	} +	nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp);  }  static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) @@ -918,7 +881,8 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)  	if (fl->fl_flags & FL_POSIX)  		lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);  	else if (fl->fl_flags & FL_FLOCK) -		lsp = nfs4_get_lock_state(state, 0, 
fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); +		lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid, +				NFS4_FLOCK_LOCK_TYPE);  	else  		return -EINVAL;  	if (lsp == NULL) @@ -928,28 +892,49 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)  	return 0;  } -/* - * Byte-range lock aware utility to initialize the stateid of read/write - * requests. - */ -void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) +static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, +		fl_owner_t fl_owner, pid_t fl_pid)  {  	struct nfs4_lock_state *lsp; -	int seq; +	bool ret = false; -	do { -		seq = read_seqbegin(&state->seqlock); -		memcpy(dst, &state->stateid, sizeof(*dst)); -	} while (read_seqretry(&state->seqlock, seq));  	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) -		return; +		goto out;  	spin_lock(&state->state_lock);  	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); -	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) -		memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); +	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) { +		nfs4_stateid_copy(dst, &lsp->ls_stateid); +		ret = true; +	}  	spin_unlock(&state->state_lock);  	nfs4_put_lock_state(lsp); +out: +	return ret; +} + +static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ +	int seq; + +	do { +		seq = read_seqbegin(&state->seqlock); +		nfs4_stateid_copy(dst, &state->stateid); +	} while (read_seqretry(&state->seqlock, seq)); +} + +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. 
+ */ +void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, +		fmode_t fmode, fl_owner_t fl_owner, pid_t fl_pid) +{ +	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) +		return; +	if (nfs4_copy_lock_stateid(dst, state, fl_owner, fl_pid)) +		return; +	nfs4_copy_open_stateid(dst, state);  }  struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask) @@ -960,20 +945,28 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m  	if (new != NULL) {  		new->sequence = counter;  		INIT_LIST_HEAD(&new->list); +		new->task = NULL;  	}  	return new;  }  void nfs_release_seqid(struct nfs_seqid *seqid)  { -	if (!list_empty(&seqid->list)) { -		struct rpc_sequence *sequence = seqid->sequence->sequence; +	struct nfs_seqid_counter *sequence; -		spin_lock(&sequence->lock); -		list_del_init(&seqid->list); -		spin_unlock(&sequence->lock); -		rpc_wake_up(&sequence->wait); +	if (list_empty(&seqid->list)) +		return; +	sequence = seqid->sequence; +	spin_lock(&sequence->lock); +	list_del_init(&seqid->list); +	if (!list_empty(&sequence->list)) { +		struct nfs_seqid *next; + +		next = list_first_entry(&sequence->list, +				struct nfs_seqid, list); +		rpc_wake_up_queued_task(&sequence->wait, next->task);  	} +	spin_unlock(&sequence->lock);  }  void nfs_free_seqid(struct nfs_seqid *seqid) @@ -989,14 +982,14 @@ void nfs_free_seqid(struct nfs_seqid *seqid)   */  static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)  { -	BUG_ON(list_first_entry(&seqid->sequence->sequence->list, struct nfs_seqid, list) != seqid); +	BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid);  	switch (status) {  		case 0:  			break;  		case -NFS4ERR_BAD_SEQID:  			if (seqid->sequence->flags & NFS_SEQID_CONFIRMED)  				return; -			printk(KERN_WARNING "NFS: v4 server returned a bad" +			pr_warn_ratelimited("NFS: v4 server returned a bad"  					" sequence-id error on an"  					" unconfirmed sequence 
%p!\n",  					seqid->sequence); @@ -1040,10 +1033,11 @@ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)  int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)  { -	struct rpc_sequence *sequence = seqid->sequence->sequence; +	struct nfs_seqid_counter *sequence = seqid->sequence;  	int status = 0;  	spin_lock(&sequence->lock); +	seqid->task = task;  	if (list_empty(&seqid->list))  		list_add_tail(&seqid->list, &sequence->list);  	if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) @@ -1072,19 +1066,28 @@ static void nfs4_clear_state_manager_bit(struct nfs_client *clp)  void nfs4_schedule_state_manager(struct nfs_client *clp)  {  	struct task_struct *task; +	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];  	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)  		return;  	__module_get(THIS_MODULE);  	atomic_inc(&clp->cl_count); -	task = kthread_run(nfs4_run_state_manager, clp, "%s-manager", -				rpc_peeraddr2str(clp->cl_rpcclient, -							RPC_DISPLAY_ADDR)); -	if (!IS_ERR(task)) -		return; -	nfs4_clear_state_manager_bit(clp); -	nfs_put_client(clp); -	module_put(THIS_MODULE); + +	/* The rcu_read_lock() is not strictly necessary, as the state +	 * manager is the only thread that ever changes the rpc_xprt +	 * after it's initialized.  At this point, we're single threaded. 
*/ +	rcu_read_lock(); +	snprintf(buf, sizeof(buf), "%s-manager", +			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); +	rcu_read_unlock(); +	task = kthread_run(nfs4_run_state_manager, clp, buf); +	if (IS_ERR(task)) { +		printk(KERN_ERR "%s: kthread_run: %ld\n", +			__func__, PTR_ERR(task)); +		nfs4_clear_state_manager_bit(clp); +		nfs_put_client(clp); +		module_put(THIS_MODULE); +	}  }  /* @@ -1098,10 +1101,25 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)  		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);  	nfs4_schedule_state_manager(clp);  } +EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); + +/* + * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN + * @clp: client to process + * + * Set the NFS4CLNT_LEASE_EXPIRED state in order to force a + * resend of the SETCLIENTID and hence re-establish the + * callback channel. Then return all existing delegations. + */ +static void nfs40_handle_cb_pathdown(struct nfs_client *clp) +{ +	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); +	nfs_expire_all_delegations(clp); +}  void nfs4_schedule_path_down_recovery(struct nfs_client *clp)  { -	nfs_handle_cb_pathdown(clp); +	nfs40_handle_cb_pathdown(clp);  	nfs4_schedule_state_manager(clp);  } @@ -1135,6 +1153,34 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4  	nfs4_state_mark_reclaim_nograce(clp, state);  	nfs4_schedule_state_manager(clp);  } +EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); + +void nfs_inode_find_state_and_recover(struct inode *inode, +		const nfs4_stateid *stateid) +{ +	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; +	struct nfs_inode *nfsi = NFS_I(inode); +	struct nfs_open_context *ctx; +	struct nfs4_state *state; +	bool found = false; + +	spin_lock(&inode->i_lock); +	list_for_each_entry(ctx, &nfsi->open_files, list) { +		state = ctx->state; +		if (state == NULL) +			continue; +		if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) +			continue; +		if 
(!nfs4_stateid_match(&state->stateid, stateid)) +			continue; +		nfs4_state_mark_reclaim_nograce(clp, state); +		found = true; +	} +	spin_unlock(&inode->i_lock); +	if (found) +		nfs4_schedule_state_manager(clp); +} +  static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)  { @@ -1173,8 +1219,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_  			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:  				goto out;  			default: -				printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", -						__func__, status); +				printk(KERN_ERR "NFS: %s: unhandled error %d. " +					"Zeroing state\n", __func__, status);  			case -ENOMEM:  			case -NFS4ERR_DENIED:  			case -NFS4ERR_RECLAIM_BAD: @@ -1220,8 +1266,9 @@ restart:  				spin_lock(&state->state_lock);  				list_for_each_entry(lock, &state->lock_states, ls_locks) {  					if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) -						printk("%s: Lock reclaim failed!\n", -							__func__); +						pr_warn_ratelimited("NFS: " +							"%s: Lock reclaim " +							"failed!\n", __func__);  				}  				spin_unlock(&state->state_lock);  				nfs4_put_open_state(state); @@ -1230,8 +1277,8 @@ restart:  		}  		switch (status) {  			default: -				printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", -						__func__, status); +				printk(KERN_ERR "NFS: %s: unhandled error %d. " +					"Zeroing state\n", __func__, status);  			case -ENOENT:  			case -ENOMEM:  			case -ESTALE: @@ -1239,8 +1286,8 @@ restart:  				 * Open state on this file cannot be recovered  				 * All we can do is revert to using the zero stateid.  				 
*/ -				memset(state->stateid.data, 0, -					sizeof(state->stateid.data)); +				memset(&state->stateid, 0, +					sizeof(state->stateid));  				/* Mark the file as being 'closed' */  				state->state = 0;  				break; @@ -1418,7 +1465,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)  		case 0:  			break;  		case -NFS4ERR_CB_PATH_DOWN: -			nfs_handle_cb_pathdown(clp); +			nfs40_handle_cb_pathdown(clp);  			break;  		case -NFS4ERR_NO_GRACE:  			nfs4_state_end_reclaim_reboot(clp); @@ -1799,7 +1846,7 @@ static void nfs4_state_manager(struct nfs_client *clp)  	} while (atomic_read(&clp->cl_count) > 1);  	return;  out_error: -	printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" +	pr_warn_ratelimited("NFS: state manager failed on NFSv4 server %s"  			" with error %d\n", clp->cl_hostname, -status);  	nfs4_end_drain_session(clp);  	nfs4_clear_state_manager_bit(clp); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 95e92e43840..c74fdb114b4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -44,6 +44,8 @@  #include <linux/pagemap.h>  #include <linux/proc_fs.h>  #include <linux/kdev_t.h> +#include <linux/module.h> +#include <linux/utsname.h>  #include <linux/sunrpc/clnt.h>  #include <linux/sunrpc/msg_prot.h>  #include <linux/sunrpc/gss_api.h> @@ -271,7 +273,12 @@ static int nfs4_stat_to_errno(int);  				1 /* flags */ + \  				1 /* spa_how */ + \  				0 /* SP4_NONE (for now) */ + \ -				1 /* zero implemetation id array */) +				1 /* implementation id array of size 1 */ + \ +				1 /* nii_domain */ + \ +				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ +				1 /* nii_name */ + \ +				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ +				3 /* nii_date */)  #define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \  				2 /* eir_clientid */ + \  				1 /* eir_sequenceid */ + \ @@ -284,7 +291,11 @@ static int nfs4_stat_to_errno(int);  				/* eir_server_scope<> */ \  				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 + \  				1 /* eir_server_impl_id array length */ + \ 
-				0 /* ignored eir_server_impl_id contents */) +				1 /* nii_domain */ + \ +				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ +				1 /* nii_name */ + \ +				XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ +				3 /* nii_date */)  #define encode_channel_attrs_maxsz  (6 + 1 /* ca_rdma_ird.len (0) */)  #define decode_channel_attrs_maxsz  (6 + \  				     1 /* ca_rdma_ird.len */ + \ @@ -838,6 +849,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +  				    XDR_UNIT);  #endif /* CONFIG_NFS_V4_1 */ +static unsigned short send_implementation_id = 1; + +module_param(send_implementation_id, ushort, 0644); +MODULE_PARM_DESC(send_implementation_id, +		"Send implementation ID with NFSv4.1 exchange_id"); +  static const umode_t nfs_type2fmt[] = {  	[NF4BAD] = 0,  	[NF4REG] = S_IFREG, @@ -868,15 +885,44 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)  	return p;  } +static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ +	__be32 *p; + +	p = xdr_reserve_space(xdr, len); +	xdr_encode_opaque_fixed(p, buf, len); +} +  static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)  {  	__be32 *p; -	p = xdr_reserve_space(xdr, 4 + len); -	BUG_ON(p == NULL); +	p = reserve_space(xdr, 4 + len);  	xdr_encode_opaque(p, str, len);  } +static void encode_uint32(struct xdr_stream *xdr, u32 n) +{ +	__be32 *p; + +	p = reserve_space(xdr, 4); +	*p = cpu_to_be32(n); +} + +static void encode_uint64(struct xdr_stream *xdr, u64 n) +{ +	__be32 *p; + +	p = reserve_space(xdr, 8); +	xdr_encode_hyper(p, n); +} + +static void encode_nfs4_seqid(struct xdr_stream *xdr, +		const struct nfs_seqid *seqid) +{ +	encode_uint32(xdr, seqid->sequence->counter); +} +  static void encode_compound_hdr(struct xdr_stream *xdr,  				struct rpc_rqst *req,  				struct compound_hdr *hdr) @@ -889,28 +935,37 @@ static void encode_compound_hdr(struct xdr_stream *xdr,  	 * but this is not required as a MUST for the server to do so. 
*/  	hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; -	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);  	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); -	p = reserve_space(xdr, 4 + hdr->taglen + 8); -	p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); +	encode_string(xdr, hdr->taglen, hdr->tag); +	p = reserve_space(xdr, 8);  	*p++ = cpu_to_be32(hdr->minorversion);  	hdr->nops_p = p;  	*p = cpu_to_be32(hdr->nops);  } +static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op, +		uint32_t replen, +		struct compound_hdr *hdr) +{ +	encode_uint32(xdr, op); +	hdr->nops++; +	hdr->replen += replen; +} +  static void encode_nops(struct compound_hdr *hdr)  {  	BUG_ON(hdr->nops > NFS4_MAX_OPS);  	*hdr->nops_p = htonl(hdr->nops);  } -static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)  { -	__be32 *p; +	encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); +} -	p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); -	BUG_ON(p == NULL); -	xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); +static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ +	encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);  }  static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) @@ -1023,7 +1078,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const  	 * Now we backfill the bitmap and the attribute buffer length.  	 
*/  	if (len != ((char *)p - (char *)q) + 4) { -		printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", +		printk(KERN_ERR "NFS: Attr length error, %u != %Zu\n",  				len, ((char *)p - (char *)q) + 4);  		BUG();  	} @@ -1037,46 +1092,33 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const  static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_ACCESS); -	*p = cpu_to_be32(access); -	hdr->nops++; -	hdr->replen += decode_access_maxsz; +	encode_op_hdr(xdr, OP_ACCESS, decode_access_maxsz, hdr); +	encode_uint32(xdr, access);  }  static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_CLOSE); -	*p++ = cpu_to_be32(arg->seqid->sequence->counter); -	xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_close_maxsz; +	encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); +	encode_nfs4_seqid(xdr, arg->seqid); +	encode_nfs4_stateid(xdr, arg->stateid);  }  static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 16); -	*p++ = cpu_to_be32(OP_COMMIT); +	encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); +	p = reserve_space(xdr, 12);  	p = xdr_encode_hyper(p, args->offset);  	*p = cpu_to_be32(args->count); -	hdr->nops++; -	hdr->replen += decode_commit_maxsz;  }  static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_CREATE); -	*p = cpu_to_be32(create->ftype); +	encode_op_hdr(xdr, OP_CREATE, decode_create_maxsz, hdr); +	encode_uint32(xdr, create->ftype);  	switch (create->ftype) {  	case NF4LNK: @@ -1096,9 +1138,6 @@ static void 
encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *  	}  	encode_string(xdr, create->name->len, create->name->name); -	hdr->nops++; -	hdr->replen += decode_create_maxsz; -  	encode_attrs(xdr, create->attrs, create->server);  } @@ -1106,25 +1145,21 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c  {  	__be32 *p; -	p = reserve_space(xdr, 12); -	*p++ = cpu_to_be32(OP_GETATTR); +	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); +	p = reserve_space(xdr, 8);  	*p++ = cpu_to_be32(1);  	*p = cpu_to_be32(bitmap); -	hdr->nops++; -	hdr->replen += decode_getattr_maxsz;  }  static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 16); -	*p++ = cpu_to_be32(OP_GETATTR); +	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); +	p = reserve_space(xdr, 12);  	*p++ = cpu_to_be32(2);  	*p++ = cpu_to_be32(bm0);  	*p = cpu_to_be32(bm1); -	hdr->nops++; -	hdr->replen += decode_getattr_maxsz;  }  static void @@ -1134,8 +1169,7 @@ encode_getattr_three(struct xdr_stream *xdr,  {  	__be32 *p; -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_GETATTR); +	encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr);  	if (bm2) {  		p = reserve_space(xdr, 16);  		*p++ = cpu_to_be32(3); @@ -1152,8 +1186,6 @@ encode_getattr_three(struct xdr_stream *xdr,  		*p++ = cpu_to_be32(1);  		*p = cpu_to_be32(bm0);  	} -	hdr->nops++; -	hdr->replen += decode_getattr_maxsz;  }  static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) @@ -1179,23 +1211,13 @@ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, stru  static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_GETFH); -	hdr->nops++; -	hdr->replen += decode_getfh_maxsz; +	encode_op_hdr(xdr, OP_GETFH, decode_getfh_maxsz, hdr);  }  static void 
encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8 + name->len); -	*p++ = cpu_to_be32(OP_LINK); -	xdr_encode_opaque(p, name->name, name->len); -	hdr->nops++; -	hdr->replen += decode_link_maxsz; +	encode_op_hdr(xdr, OP_LINK, decode_link_maxsz, hdr); +	encode_string(xdr, name->len, name->name);  }  static inline int nfs4_lock_type(struct file_lock *fl, int block) @@ -1232,79 +1254,60 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args  {  	__be32 *p; -	p = reserve_space(xdr, 32); -	*p++ = cpu_to_be32(OP_LOCK); +	encode_op_hdr(xdr, OP_LOCK, decode_lock_maxsz, hdr); +	p = reserve_space(xdr, 28);  	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));  	*p++ = cpu_to_be32(args->reclaim);  	p = xdr_encode_hyper(p, args->fl->fl_start);  	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));  	*p = cpu_to_be32(args->new_lock_owner);  	if (args->new_lock_owner){ -		p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); -		*p++ = cpu_to_be32(args->open_seqid->sequence->counter); -		p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); -		*p++ = cpu_to_be32(args->lock_seqid->sequence->counter); +		encode_nfs4_seqid(xdr, args->open_seqid); +		encode_nfs4_stateid(xdr, args->open_stateid); +		encode_nfs4_seqid(xdr, args->lock_seqid);  		encode_lockowner(xdr, &args->lock_owner);  	}  	else { -		p = reserve_space(xdr, NFS4_STATEID_SIZE+4); -		p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); -		*p = cpu_to_be32(args->lock_seqid->sequence->counter); +		encode_nfs4_stateid(xdr, args->lock_stateid); +		encode_nfs4_seqid(xdr, args->lock_seqid);  	} -	hdr->nops++; -	hdr->replen += decode_lock_maxsz;  }  static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 24); -	*p++ = cpu_to_be32(OP_LOCKT); +	encode_op_hdr(xdr, OP_LOCKT, 
decode_lockt_maxsz, hdr); +	p = reserve_space(xdr, 20);  	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));  	p = xdr_encode_hyper(p, args->fl->fl_start);  	p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));  	encode_lockowner(xdr, &args->lock_owner); -	hdr->nops++; -	hdr->replen += decode_lockt_maxsz;  }  static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); -	*p++ = cpu_to_be32(OP_LOCKU); -	*p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); -	*p++ = cpu_to_be32(args->seqid->sequence->counter); -	p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); +	encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); +	encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); +	encode_nfs4_seqid(xdr, args->seqid); +	encode_nfs4_stateid(xdr, args->stateid); +	p = reserve_space(xdr, 16);  	p = xdr_encode_hyper(p, args->fl->fl_start);  	xdr_encode_hyper(p, nfs4_lock_length(args->fl)); -	hdr->nops++; -	hdr->replen += decode_locku_maxsz;  }  static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_RELEASE_LOCKOWNER); +	encode_op_hdr(xdr, OP_RELEASE_LOCKOWNER, decode_release_lockowner_maxsz, hdr);  	encode_lockowner(xdr, lowner); -	hdr->nops++; -	hdr->replen += decode_release_lockowner_maxsz;  }  static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)  { -	int len = name->len; -	__be32 *p; - -	p = reserve_space(xdr, 8 + len); -	*p++ = cpu_to_be32(OP_LOOKUP); -	xdr_encode_opaque(p, name->name, len); -	hdr->nops++; -	hdr->replen += decode_lookup_maxsz; +	encode_op_hdr(xdr, OP_LOOKUP, decode_lookup_maxsz, hdr); +	encode_string(xdr, name->len, name->name);  }  static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) @@ -1335,9 +1338,7 @@ static inline void 
encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena   * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,   * owner 4 = 32   */ -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_OPEN); -	*p = cpu_to_be32(arg->seqid->sequence->counter); +	encode_nfs4_seqid(xdr, arg->seqid);  	encode_share_access(xdr, arg->fmode);  	p = reserve_space(xdr, 32);  	p = xdr_encode_hyper(p, arg->clientid); @@ -1437,14 +1438,15 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc  {  	__be32 *p; -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); -	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); +	p = reserve_space(xdr, 4); +	*p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); +	encode_nfs4_stateid(xdr, stateid);  	encode_string(xdr, name->len, name->name);  }  static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)  { +	encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);  	encode_openhdr(xdr, arg);  	encode_opentype(xdr, arg);  	switch (arg->claim) { @@ -1460,88 +1462,64 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,  	default:  		BUG();  	} -	hdr->nops++; -	hdr->replen += decode_open_maxsz;  }  static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); -	*p++ = cpu_to_be32(OP_OPEN_CONFIRM); -	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); -	*p = cpu_to_be32(arg->seqid->sequence->counter); -	hdr->nops++; -	hdr->replen += decode_open_confirm_maxsz; +	encode_op_hdr(xdr, OP_OPEN_CONFIRM, decode_open_confirm_maxsz, hdr); +	encode_nfs4_stateid(xdr, arg->stateid); +	encode_nfs4_seqid(xdr, arg->seqid);  }  static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)  { -	__be32 *p; 
- -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); -	*p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); -	p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); -	*p = cpu_to_be32(arg->seqid->sequence->counter); +	encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); +	encode_nfs4_stateid(xdr, arg->stateid); +	encode_nfs4_seqid(xdr, arg->seqid);  	encode_share_access(xdr, arg->fmode); -	hdr->nops++; -	hdr->replen += decode_open_downgrade_maxsz;  }  static void  encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)  { -	int len = fh->size; -	__be32 *p; - -	p = reserve_space(xdr, 8 + len); -	*p++ = cpu_to_be32(OP_PUTFH); -	xdr_encode_opaque(p, fh->data, len); -	hdr->nops++; -	hdr->replen += decode_putfh_maxsz; +	encode_op_hdr(xdr, OP_PUTFH, decode_putfh_maxsz, hdr); +	encode_string(xdr, fh->size, fh->data);  }  static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_PUTROOTFH); -	hdr->nops++; -	hdr->replen += decode_putrootfh_maxsz; +	encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);  } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) +static void encode_open_stateid(struct xdr_stream *xdr, +		const struct nfs_open_context *ctx, +		const struct nfs_lock_context *l_ctx, +		fmode_t fmode, +		int zero_seqid)  {  	nfs4_stateid stateid; -	__be32 *p; -	p = reserve_space(xdr, NFS4_STATEID_SIZE);  	if (ctx->state != NULL) { -		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); +		nfs4_select_rw_stateid(&stateid, ctx->state, +				fmode, l_ctx->lockowner, l_ctx->pid);  		if (zero_seqid) -			stateid.stateid.seqid = 0; -		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); +			stateid.seqid = 0; +		encode_nfs4_stateid(xdr, &stateid);  	} else -		xdr_encode_opaque_fixed(p, zero_stateid.data, 
NFS4_STATEID_SIZE); +		encode_nfs4_stateid(xdr, &zero_stateid);  }  static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_READ); - -	encode_stateid(xdr, args->context, args->lock_context, -		       hdr->minorversion); +	encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); +	encode_open_stateid(xdr, args->context, args->lock_context, +			FMODE_READ, hdr->minorversion);  	p = reserve_space(xdr, 12);  	p = xdr_encode_hyper(p, args->offset);  	*p = cpu_to_be32(args->count); -	hdr->nops++; -	hdr->replen += decode_read_maxsz;  }  static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr) @@ -1551,7 +1529,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg  		FATTR4_WORD1_MOUNTED_ON_FILEID,  	};  	uint32_t dircount = readdir->count >> 1; -	__be32 *p; +	__be32 *p, verf[2];  	if (readdir->plus) {  		attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE| @@ -1566,80 +1544,54 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg  	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))  		attrs[0] |= FATTR4_WORD0_FILEID; -	p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); -	*p++ = cpu_to_be32(OP_READDIR); -	p = xdr_encode_hyper(p, readdir->cookie); -	p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); +	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr); +	encode_uint64(xdr, readdir->cookie); +	encode_nfs4_verifier(xdr, &readdir->verifier); +	p = reserve_space(xdr, 20);  	*p++ = cpu_to_be32(dircount);  	*p++ = cpu_to_be32(readdir->count);  	*p++ = cpu_to_be32(2);  	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);  	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); -	hdr->nops++; -	hdr->replen += decode_readdir_maxsz; +	memcpy(verf, readdir->verifier.data, 
sizeof(verf));  	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",  			__func__,  			(unsigned long long)readdir->cookie, -			((u32 *)readdir->verifier.data)[0], -			((u32 *)readdir->verifier.data)[1], +			verf[0], verf[1],  			attrs[0] & readdir->bitmask[0],  			attrs[1] & readdir->bitmask[1]);  }  static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_READLINK); -	hdr->nops++; -	hdr->replen += decode_readlink_maxsz; +	encode_op_hdr(xdr, OP_READLINK, decode_readlink_maxsz, hdr);  }  static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8 + name->len); -	*p++ = cpu_to_be32(OP_REMOVE); -	xdr_encode_opaque(p, name->name, name->len); -	hdr->nops++; -	hdr->replen += decode_remove_maxsz; +	encode_op_hdr(xdr, OP_REMOVE, decode_remove_maxsz, hdr); +	encode_string(xdr, name->len, name->name);  }  static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_RENAME); +	encode_op_hdr(xdr, OP_RENAME, decode_rename_maxsz, hdr);  	encode_string(xdr, oldname->len, oldname->name);  	encode_string(xdr, newname->len, newname->name); -	hdr->nops++; -	hdr->replen += decode_rename_maxsz;  } -static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr) +static void encode_renew(struct xdr_stream *xdr, clientid4 clid, +			 struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 12); -	*p++ = cpu_to_be32(OP_RENEW); -	xdr_encode_hyper(p, client_stateid->cl_clientid); -	hdr->nops++; -	hdr->replen += decode_renew_maxsz; +	encode_op_hdr(xdr, OP_RENEW, decode_renew_maxsz, hdr); +	encode_uint64(xdr, clid);  }  static void  
encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_RESTOREFH); -	hdr->nops++; -	hdr->replen += decode_restorefh_maxsz; +	encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr);  }  static void @@ -1647,9 +1599,8 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun  {  	__be32 *p; -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_SETATTR); -	xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); +	encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); +	encode_nfs4_stateid(xdr, &zero_stateid);  	p = reserve_space(xdr, 2*4);  	*p++ = cpu_to_be32(1);  	*p = cpu_to_be32(FATTR4_WORD0_ACL); @@ -1657,30 +1608,18 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun  	p = reserve_space(xdr, 4);  	*p = cpu_to_be32(arg->acl_len);  	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); -	hdr->nops++; -	hdr->replen += decode_setacl_maxsz;  }  static void  encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_SAVEFH); -	hdr->nops++; -	hdr->replen += decode_savefh_maxsz; +	encode_op_hdr(xdr, OP_SAVEFH, decode_savefh_maxsz, hdr);  }  static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_SETATTR); -	xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_setattr_maxsz; +	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); +	encode_nfs4_stateid(xdr, &arg->stateid);  	encode_attrs(xdr, arg->iap, server);  } @@ -1688,9 +1627,8 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie  {  	__be32 *p; -	p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); -	
*p++ = cpu_to_be32(OP_SETCLIENTID); -	xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); +	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); +	encode_nfs4_verifier(xdr, setclientid->sc_verifier);  	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);  	p = reserve_space(xdr, 4); @@ -1699,31 +1637,23 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie  	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);  	p = reserve_space(xdr, 4);  	*p = cpu_to_be32(setclientid->sc_cb_ident); -	hdr->nops++; -	hdr->replen += decode_setclientid_maxsz;  }  static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); -	*p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); -	p = xdr_encode_hyper(p, arg->clientid); -	xdr_encode_opaque_fixed(p, arg->confirm.data, NFS4_VERIFIER_SIZE); -	hdr->nops++; -	hdr->replen += decode_setclientid_confirm_maxsz; +	encode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM, +			decode_setclientid_confirm_maxsz, hdr); +	encode_uint64(xdr, arg->clientid); +	encode_nfs4_verifier(xdr, &arg->confirm);  }  static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)  {  	__be32 *p; -	p = reserve_space(xdr, 4); -	*p = cpu_to_be32(OP_WRITE); - -	encode_stateid(xdr, args->context, args->lock_context, -		       hdr->minorversion); +	encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); +	encode_open_stateid(xdr, args->context, args->lock_context, +			FMODE_WRITE, hdr->minorversion);  	p = reserve_space(xdr, 16);  	p = xdr_encode_hyper(p, args->offset); @@ -1731,32 +1661,18 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg  	*p = cpu_to_be32(args->count);  	xdr_write_pages(xdr, args->pages, args->pgbase, args->count); -	hdr->nops++; -	hdr->replen += 
decode_write_maxsz;  }  static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - -	*p++ = cpu_to_be32(OP_DELEGRETURN); -	xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_delegreturn_maxsz; +	encode_op_hdr(xdr, OP_DELEGRETURN, decode_delegreturn_maxsz, hdr); +	encode_nfs4_stateid(xdr, stateid);  }  static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)  { -	int len = name->len; -	__be32 *p; - -	p = reserve_space(xdr, 8 + len); -	*p++ = cpu_to_be32(OP_SECINFO); -	xdr_encode_opaque(p, name->name, len); -	hdr->nops++; -	hdr->replen += decode_secinfo_maxsz; +	encode_op_hdr(xdr, OP_SECINFO, decode_secinfo_maxsz, hdr); +	encode_string(xdr, name->len, name->name);  }  #if defined(CONFIG_NFS_V4_1) @@ -1766,19 +1682,39 @@ static void encode_exchange_id(struct xdr_stream *xdr,  			       struct compound_hdr *hdr)  {  	__be32 *p; +	char impl_name[NFS4_OPAQUE_LIMIT]; +	int len = 0; -	p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); -	*p++ = cpu_to_be32(OP_EXCHANGE_ID); -	xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); +	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); +	encode_nfs4_verifier(xdr, args->verifier);  	encode_string(xdr, args->id_len, args->id);  	p = reserve_space(xdr, 12);  	*p++ = cpu_to_be32(args->flags);  	*p++ = cpu_to_be32(0);	/* zero length state_protect4_a */ -	*p = cpu_to_be32(0);	/* zero length implementation id array */ -	hdr->nops++; -	hdr->replen += decode_exchange_id_maxsz; + +	if (send_implementation_id && +	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && +	    sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) +		<= NFS4_OPAQUE_LIMIT + 1) +		len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", +			       utsname()->sysname, utsname()->release, +			       
utsname()->version, utsname()->machine); + +	if (len > 0) { +		*p = cpu_to_be32(1);	/* implementation id array length=1 */ + +		encode_string(xdr, +			sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) - 1, +			CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN); +		encode_string(xdr, len, impl_name); +		/* just send zeros for nii_date - the date is in nii_name */ +		p = reserve_space(xdr, 12); +		p = xdr_encode_hyper(p, 0); +		*p = cpu_to_be32(0); +	} else +		*p = cpu_to_be32(0);	/* implementation id array length=0 */  }  static void encode_create_session(struct xdr_stream *xdr, @@ -1801,8 +1737,8 @@ static void encode_create_session(struct xdr_stream *xdr,  	len = scnprintf(machine_name, sizeof(machine_name), "%s",  			clp->cl_ipaddr); -	p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); -	*p++ = cpu_to_be32(OP_CREATE_SESSION); +	encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); +	p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12);  	p = xdr_encode_hyper(p, clp->cl_clientid);  	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */  	*p++ = cpu_to_be32(args->flags);			/*flags */ @@ -1835,33 +1771,22 @@ static void encode_create_session(struct xdr_stream *xdr,  	*p++ = cpu_to_be32(0);				/* UID */  	*p++ = cpu_to_be32(0);				/* GID */  	*p = cpu_to_be32(0);				/* No more gids */ -	hdr->nops++; -	hdr->replen += decode_create_session_maxsz;  }  static void encode_destroy_session(struct xdr_stream *xdr,  				   struct nfs4_session *session,  				   struct compound_hdr *hdr)  { -	__be32 *p; -	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); -	*p++ = cpu_to_be32(OP_DESTROY_SESSION); -	xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); -	hdr->nops++; -	hdr->replen += decode_destroy_session_maxsz; +	encode_op_hdr(xdr, OP_DESTROY_SESSION, decode_destroy_session_maxsz, hdr); +	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);  }  static void encode_reclaim_complete(struct xdr_stream *xdr,  				    struct 
nfs41_reclaim_complete_args *args,  				    struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_RECLAIM_COMPLETE); -	*p++ = cpu_to_be32(args->one_fs); -	hdr->nops++; -	hdr->replen += decode_reclaim_complete_maxsz; +	encode_op_hdr(xdr, OP_RECLAIM_COMPLETE, decode_reclaim_complete_maxsz, hdr); +	encode_uint32(xdr, args->one_fs);  }  #endif /* CONFIG_NFS_V4_1 */ @@ -1883,8 +1808,7 @@ static void encode_sequence(struct xdr_stream *xdr,  	WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);  	slot = tp->slots + args->sa_slotid; -	p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); -	*p++ = cpu_to_be32(OP_SEQUENCE); +	encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);  	/*  	 * Sessionid + seqid + slotid + max slotid + cache_this @@ -1898,13 +1822,12 @@ static void encode_sequence(struct xdr_stream *xdr,  		((u32 *)session->sess_id.data)[3],  		slot->seq_nr, args->sa_slotid,  		tp->highest_used_slotid, args->sa_cache_this); +	p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);  	p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);  	*p++ = cpu_to_be32(slot->seq_nr);  	*p++ = cpu_to_be32(args->sa_slotid);  	*p++ = cpu_to_be32(tp->highest_used_slotid);  	*p = cpu_to_be32(args->sa_cache_this); -	hdr->nops++; -	hdr->replen += decode_sequence_maxsz;  #endif /* CONFIG_NFS_V4_1 */  } @@ -1919,14 +1842,12 @@ encode_getdevicelist(struct xdr_stream *xdr,  		.data = "dummmmmy",  	}; -	p = reserve_space(xdr, 20); -	*p++ = cpu_to_be32(OP_GETDEVICELIST); +	encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr); +	p = reserve_space(xdr, 16);  	*p++ = cpu_to_be32(args->layoutclass);  	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);  	xdr_encode_hyper(p, 0ULL);                          /* cookie */  	encode_nfs4_verifier(xdr, &dummy); -	hdr->nops++; -	hdr->replen += decode_getdevicelist_maxsz;  }  static void @@ -1936,15 +1857,13 @@ encode_getdeviceinfo(struct xdr_stream *xdr,  {  	__be32 
*p; -	p = reserve_space(xdr, 16 + NFS4_DEVICEID4_SIZE); -	*p++ = cpu_to_be32(OP_GETDEVICEINFO); +	encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); +	p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);  	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,  				    NFS4_DEVICEID4_SIZE);  	*p++ = cpu_to_be32(args->pdev->layout_type);  	*p++ = cpu_to_be32(args->pdev->pglen);		/* gdia_maxcount */  	*p++ = cpu_to_be32(0);				/* bitmap length 0 */ -	hdr->nops++; -	hdr->replen += decode_getdeviceinfo_maxsz;  }  static void @@ -1954,16 +1873,16 @@ encode_layoutget(struct xdr_stream *xdr,  {  	__be32 *p; -	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_LAYOUTGET); +	encode_op_hdr(xdr, OP_LAYOUTGET, decode_layoutget_maxsz, hdr); +	p = reserve_space(xdr, 36);  	*p++ = cpu_to_be32(0);     /* Signal layout available */  	*p++ = cpu_to_be32(args->type);  	*p++ = cpu_to_be32(args->range.iomode);  	p = xdr_encode_hyper(p, args->range.offset);  	p = xdr_encode_hyper(p, args->range.length);  	p = xdr_encode_hyper(p, args->minlength); -	p = xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); -	*p = cpu_to_be32(args->maxcount); +	encode_nfs4_stateid(xdr, &args->stateid); +	encode_uint32(xdr, args->maxcount);  	dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n",  		__func__, @@ -1972,8 +1891,6 @@ encode_layoutget(struct xdr_stream *xdr,  		(unsigned long)args->range.offset,  		(unsigned long)args->range.length,  		args->maxcount); -	hdr->nops++; -	hdr->replen += decode_layoutget_maxsz;  }  static int @@ -1987,13 +1904,14 @@ encode_layoutcommit(struct xdr_stream *xdr,  	dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,  		NFS_SERVER(args->inode)->pnfs_curr_ld->id); -	p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_LAYOUTCOMMIT); +	encode_op_hdr(xdr, OP_LAYOUTCOMMIT, decode_layoutcommit_maxsz, hdr); +	p = reserve_space(xdr, 20);  	/* Only whole file layouts */  	p = 
xdr_encode_hyper(p, 0); /* offset */  	p = xdr_encode_hyper(p, args->lastbytewritten + 1);	/* length */ -	*p++ = cpu_to_be32(0); /* reclaim */ -	p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); +	*p = cpu_to_be32(0); /* reclaim */ +	encode_nfs4_stateid(xdr, &args->stateid); +	p = reserve_space(xdr, 20);  	*p++ = cpu_to_be32(1); /* newoffset = TRUE */  	p = xdr_encode_hyper(p, args->lastbytewritten);  	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */ @@ -2002,13 +1920,9 @@ encode_layoutcommit(struct xdr_stream *xdr,  	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)  		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(  			NFS_I(inode)->layout, xdr, args); -	else { -		p = reserve_space(xdr, 4); -		*p = cpu_to_be32(0); /* no layout-type payload */ -	} +	else +		encode_uint32(xdr, 0); /* no layout-type payload */ -	hdr->nops++; -	hdr->replen += decode_layoutcommit_maxsz;  	return 0;  } @@ -2019,27 +1933,23 @@ encode_layoutreturn(struct xdr_stream *xdr,  {  	__be32 *p; -	p = reserve_space(xdr, 20); -	*p++ = cpu_to_be32(OP_LAYOUTRETURN); +	encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); +	p = reserve_space(xdr, 16);  	*p++ = cpu_to_be32(0);		/* reclaim. 
always 0 for now */  	*p++ = cpu_to_be32(args->layout_type);  	*p++ = cpu_to_be32(IOMODE_ANY);  	*p = cpu_to_be32(RETURN_FILE); -	p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); +	p = reserve_space(xdr, 16);  	p = xdr_encode_hyper(p, 0);  	p = xdr_encode_hyper(p, NFS4_MAX_UINT64);  	spin_lock(&args->inode->i_lock); -	xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE); +	encode_nfs4_stateid(xdr, &args->stateid);  	spin_unlock(&args->inode->i_lock);  	if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {  		NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(  			NFS_I(args->inode)->layout, xdr, args); -	} else { -		p = reserve_space(xdr, 4); -		*p = cpu_to_be32(0); -	} -	hdr->nops++; -	hdr->replen += decode_layoutreturn_maxsz; +	} else +		encode_uint32(xdr, 0);  }  static int @@ -2047,12 +1957,8 @@ encode_secinfo_no_name(struct xdr_stream *xdr,  		       const struct nfs41_secinfo_no_name_args *args,  		       struct compound_hdr *hdr)  { -	__be32 *p; -	p = reserve_space(xdr, 8); -	*p++ = cpu_to_be32(OP_SECINFO_NO_NAME); -	*p++ = cpu_to_be32(args->style); -	hdr->nops++; -	hdr->replen += decode_secinfo_no_name_maxsz; +	encode_op_hdr(xdr, OP_SECINFO_NO_NAME, decode_secinfo_no_name_maxsz, hdr); +	encode_uint32(xdr, args->style);  	return 0;  } @@ -2060,26 +1966,17 @@ static void encode_test_stateid(struct xdr_stream *xdr,  				struct nfs41_test_stateid_args *args,  				struct compound_hdr *hdr)  { -	__be32 *p; - -	p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_TEST_STATEID); -	*p++ = cpu_to_be32(1); -	xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_test_stateid_maxsz; +	encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr); +	encode_uint32(xdr, 1); +	encode_nfs4_stateid(xdr, args->stateid);  }  static void encode_free_stateid(struct xdr_stream *xdr,  				struct nfs41_free_stateid_args *args,  				struct compound_hdr *hdr)  { -	__be32 *p; -	
p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); -	*p++ = cpu_to_be32(OP_FREE_STATEID); -	xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); -	hdr->nops++; -	hdr->replen += decode_free_stateid_maxsz; +	encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); +	encode_nfs4_stateid(xdr, args->stateid);  }  #endif /* CONFIG_NFS_V4_1 */ @@ -2522,7 +2419,6 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,  	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,  		args->acl_pages, args->acl_pgbase, args->acl_len); -	xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);  	encode_nops(&hdr);  } @@ -2634,6 +2530,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,  	encode_sequence(xdr, &args->seq_args, &hdr);  	encode_putfh(xdr, args->fhandle, &hdr);  	encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS| +			   FATTR4_WORD0_FH_EXPIRE_TYPE|  			   FATTR4_WORD0_LINK_SUPPORT|  			   FATTR4_WORD0_SYMLINK_SUPPORT|  			   FATTR4_WORD0_ACLSUPPORT, &hdr); @@ -2651,7 +2548,7 @@ static void nfs4_xdr_enc_renew(struct rpc_rqst *req, struct xdr_stream *xdr,  	};  	encode_compound_hdr(xdr, req, &hdr); -	encode_renew(xdr, clp, &hdr); +	encode_renew(xdr, clp->cl_clientid, &hdr);  	encode_nops(&hdr);  } @@ -3181,6 +3078,28 @@ out_overflow:  	return -EIO;  } +static int decode_attr_fh_expire_type(struct xdr_stream *xdr, +				      uint32_t *bitmap, uint32_t *type) +{ +	__be32 *p; + +	*type = 0; +	if (unlikely(bitmap[0] & (FATTR4_WORD0_FH_EXPIRE_TYPE - 1U))) +		return -EIO; +	if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { +		p = xdr_inline_decode(xdr, 4); +		if (unlikely(!p)) +			goto out_overflow; +		*type = be32_to_cpup(p); +		bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; +	} +	dprintk("%s: expire type=0x%x\n", __func__, *type); +	return 0; +out_overflow: +	print_overflow_msg(__func__, xdr); +	return -EIO; +} +  static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)  { 
 	__be32 *p; @@ -3514,16 +3433,17 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)  	n = be32_to_cpup(p);  	if (n == 0)  		goto root_path; -	dprintk("path "); +	dprintk("pathname4: ");  	path->ncomponents = 0;  	while (path->ncomponents < n) {  		struct nfs4_string *component = &path->components[path->ncomponents];  		status = decode_opaque_inline(xdr, &component->len, &component->data);  		if (unlikely(status != 0))  			goto out_eio; -		if (path->ncomponents != n) -			dprintk("/"); -		dprintk("%s", component->data); +		ifdebug (XDR) +			pr_cont("%s%.*s ", +				(path->ncomponents != n ? "/ " : ""), +				component->len, component->data);  		if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)  			path->ncomponents++;  		else { @@ -3532,14 +3452,13 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)  		}  	}  out: -	dprintk("\n");  	return status;  root_path:  /* a root pathname is sent as a zero component4 */  	path->ncomponents = 1;  	path->components[0].len=0;  	path->components[0].data=NULL; -	dprintk("path /\n"); +	dprintk("pathname4: /\n");  	goto out;  out_eio:  	dprintk(" status %d", status); @@ -3561,7 +3480,11 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st  	status = 0;  	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))  		goto out; -	dprintk("%s: fsroot ", __func__); +	status = -EIO; +	/* Ignore borken servers that return unrequested attrs */ +	if (unlikely(res == NULL)) +		goto out; +	dprintk("%s: fsroot:\n", __func__);  	status = decode_pathname(xdr, &res->fs_path);  	if (unlikely(status != 0))  		goto out; @@ -3582,7 +3505,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st  		m = be32_to_cpup(p);  		loc->nservers = 0; -		dprintk("%s: servers ", __func__); +		dprintk("%s: servers:\n", __func__);  		while (loc->nservers < m) {  			struct nfs4_string *server = &loc->servers[loc->nservers];  			status = 
decode_opaque_inline(xdr, &server->len, &server->data); @@ -3614,7 +3537,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st  			res->nlocations++;  	}  	if (res->nlocations != 0) -		status = NFS_ATTR_FATTR_V4_REFERRAL; +		status = NFS_ATTR_FATTR_V4_LOCATIONS;  out:  	dprintk("%s: fs_locations done, error = %d\n", __func__, status);  	return status; @@ -4158,7 +4081,7 @@ static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)  static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)  { -	return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +	return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);  }  static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) @@ -4175,7 +4098,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)  static int decode_verifier(struct xdr_stream *xdr, void *verifier)  { -	return decode_opaque_fixed(xdr, verifier, 8); +	return decode_opaque_fixed(xdr, verifier, NFS4_VERIFIER_SIZE);  }  static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) @@ -4225,6 +4148,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re  		goto xdr_error;  	if ((status = decode_attr_supported(xdr, bitmap, res->attr_bitmask)) != 0)  		goto xdr_error; +	if ((status = decode_attr_fh_expire_type(xdr, bitmap, +						 &res->fh_expire_type)) != 0) +		goto xdr_error;  	if ((status = decode_attr_link_support(xdr, bitmap, &res->has_links)) != 0)  		goto xdr_error;  	if ((status = decode_attr_symlink_support(xdr, bitmap, &res->has_symlinks)) != 0) @@ -4295,6 +4221,7 @@ xdr_error:  static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,  		struct nfs_fattr *fattr, struct nfs_fh *fh, +		struct nfs4_fs_locations *fs_loc,  		const struct nfs_server *server)  {  	int status; @@ -4342,9 +4269,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,  		goto 
xdr_error;  	fattr->valid |= status; -	status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, -						struct nfs4_fs_locations, -						fattr)); +	status = decode_attr_fs_locations(xdr, bitmap, fs_loc);  	if (status < 0)  		goto xdr_error;  	fattr->valid |= status; @@ -4408,7 +4333,8 @@ xdr_error:  }  static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, -		struct nfs_fh *fh, const struct nfs_server *server) +		struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc, +		const struct nfs_server *server)  {  	__be32 *savep;  	uint32_t attrlen, @@ -4427,7 +4353,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat  	if (status < 0)  		goto xdr_error; -	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server); +	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);  	if (status < 0)  		goto xdr_error; @@ -4440,7 +4366,7 @@ xdr_error:  static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,  		const struct nfs_server *server)  { -	return decode_getfattr_generic(xdr, fattr, NULL, server); +	return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);  }  /* @@ -4464,8 +4390,8 @@ static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,  		return 0;  	}  	if (num > 1) -		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers " -			"per filesystem not supported\n", __func__); +		printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " +			"drivers per filesystem not supported\n", __func__);  	/* Decode and set first layout type, move xdr->p past unused types */  	p = xdr_inline_decode(xdr, num * 4); @@ -4864,17 +4790,16 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n  	size_t		hdrlen;  	u32		recvd, pglen = rcvbuf->page_len;  	int		status; +	__be32		verf[2];  	status = decode_op_hdr(xdr, OP_READDIR);  	if (!status)  		status = decode_verifier(xdr, readdir->verifier.data);  	if (unlikely(status))  		return 
status; +	memcpy(verf, readdir->verifier.data, sizeof(verf));  	dprintk("%s: verifier = %08x:%08x\n", -			__func__, -			((u32 *)readdir->verifier.data)[0], -			((u32 *)readdir->verifier.data)[1]); - +			__func__, verf[0], verf[1]);  	hdrlen = (char *) xdr->p - (char *) iov->iov_base;  	recvd = rcvbuf->len - hdrlen; @@ -5121,7 +5046,7 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)  		goto out_overflow;  	res->count = be32_to_cpup(p++);  	res->verf->committed = be32_to_cpup(p++); -	memcpy(res->verf->verifier, p, 8); +	memcpy(res->verf->verifier, p, NFS4_VERIFIER_SIZE);  	return 0;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -5215,6 +5140,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,  	char *dummy_str;  	int status;  	struct nfs_client *clp = res->client; +	uint32_t impl_id_count;  	status = decode_op_hdr(xdr, OP_EXCHANGE_ID);  	if (status) @@ -5256,11 +5182,38 @@ static int decode_exchange_id(struct xdr_stream *xdr,  	memcpy(res->server_scope->server_scope, dummy_str, dummy);  	res->server_scope->server_scope_sz = dummy; -	/* Throw away Implementation id array */ -	status = decode_opaque_inline(xdr, &dummy, &dummy_str); -	if (unlikely(status)) -		return status; +	/* Implementation Id */ +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	impl_id_count = be32_to_cpup(p++); +	if (impl_id_count) { +		/* nii_domain */ +		status = decode_opaque_inline(xdr, &dummy, &dummy_str); +		if (unlikely(status)) +			return status; +		if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) +			return -EIO; +		memcpy(res->impl_id->domain, dummy_str, dummy); + +		/* nii_name */ +		status = decode_opaque_inline(xdr, &dummy, &dummy_str); +		if (unlikely(status)) +			return status; +		if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) +			return -EIO; +		memcpy(res->impl_id->name, dummy_str, dummy); + +		/* nii_date */ +		p = xdr_inline_decode(xdr, 12); +		if (unlikely(!p)) +			goto out_overflow; +		p = xdr_decode_hyper(p, 
&res->impl_id->date.seconds); +		res->impl_id->date.nseconds = be32_to_cpup(p); + +		/* if there's more than one entry, ignore the rest */ +	}  	return 0;  out_overflow:  	print_overflow_msg(__func__, xdr); @@ -5286,8 +5239,8 @@ static int decode_chan_attrs(struct xdr_stream *xdr,  	attrs->max_reqs = be32_to_cpup(p++);  	nr_attrs = be32_to_cpup(p);  	if (unlikely(nr_attrs > 1)) { -		printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", -			__func__, nr_attrs); +		printk(KERN_WARNING "NFS: %s: Invalid rdma channel attrs " +			"count %u\n", __func__, nr_attrs);  		return -EINVAL;  	}  	if (nr_attrs == 1) { @@ -5437,14 +5390,14 @@ static int decode_getdevicelist(struct xdr_stream *xdr,  	p += 2;  	/* Read verifier */ -	p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); +	p = xdr_decode_opaque_fixed(p, verftemp.verifier, NFS4_VERIFIER_SIZE);  	res->num_devs = be32_to_cpup(p);  	dprintk("%s: num_dev %d\n", __func__, res->num_devs);  	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { -		printk(KERN_ERR "%s too many result dev_num %u\n", +		printk(KERN_ERR "NFS: %s too many result dev_num %u\n",  				__func__, res->num_devs);  		return -EIO;  	} @@ -5538,11 +5491,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,  	status = decode_op_hdr(xdr, OP_LAYOUTGET);  	if (status)  		return status; -	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE); +	p = xdr_inline_decode(xdr, 4); +	if (unlikely(!p)) +		goto out_overflow; +	res->return_on_close = be32_to_cpup(p); +	decode_stateid(xdr, &res->stateid); +	p = xdr_inline_decode(xdr, 4);  	if (unlikely(!p))  		goto out_overflow; -	res->return_on_close = be32_to_cpup(p++); -	p = xdr_decode_opaque_fixed(p, res->stateid.data, NFS4_STATEID_SIZE);  	layout_count = be32_to_cpup(p);  	if (!layout_count) {  		dprintk("%s: server responded with empty layout array\n", @@ -5667,7 +5623,8 @@ static int decode_test_stateid(struct xdr_stream *xdr,  	if (unlikely(!p))  		goto out_overflow;  	res->status = 
be32_to_cpup(p++); -	return res->status; + +	return status;  out_overflow:  	print_overflow_msg(__func__, xdr);  out: @@ -6032,6 +5989,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,  	struct compound_hdr hdr;  	int status; +	if (res->acl_scratch != NULL) { +		void *p = page_address(res->acl_scratch); +		xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); +	}  	status = decode_compound_hdr(xdr, &hdr);  	if (status)  		goto out; @@ -6580,8 +6541,9 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,  	if (status)  		goto out;  	xdr_enter_page(xdr, PAGE_SIZE); -	status = decode_getfattr(xdr, &res->fs_locations->fattr, -				 res->fs_locations->server); +	status = decode_getfattr_generic(xdr, &res->fs_locations->fattr, +					 NULL, res->fs_locations, +					 res->fs_locations->server);  out:  	return status;  } @@ -6961,7 +6923,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,  		goto out_overflow;  	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, -					entry->server) < 0) +				  NULL, entry->server) < 0)  		goto out_overflow;  	if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)  		entry->ino = entry->fattr->mounted_on_fileid; @@ -7109,7 +7071,7 @@ struct rpc_procinfo	nfs4_procedures[] = {  #endif /* CONFIG_NFS_V4_1 */  }; -struct rpc_version		nfs_version4 = { +const struct rpc_version nfs_version4 = {  	.number			= 4,  	.nrprocs		= ARRAY_SIZE(nfs4_procedures),  	.procs			= nfs4_procedures diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index c4744e1d513..cd3c910d2d1 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -104,7 +104,7 @@ static char nfs_export_path[NFS_MAXPATHLEN + 1] __initdata = "";  /* server:export path string passed to super.c */  static char nfs_root_device[NFS_MAXPATHLEN + 1] __initdata = ""; -#ifdef RPC_DEBUG +#ifdef NFS_DEBUG  /*   * When the "nfsrootdebug" kernel command line option is specified,   * enable debugging messages for NFSROOT. 
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 55d01280a60..4bff4a3dab4 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -137,6 +137,7 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,  	struct objio_dev_ent *ode;  	struct osd_dev *od;  	struct osd_dev_info odi; +	bool retry_flag = true;  	int err;  	ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); @@ -171,10 +172,18 @@ static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,  		goto out;  	} +retry_lookup:  	od = osduld_info_lookup(&odi);  	if (unlikely(IS_ERR(od))) {  		err = PTR_ERR(od);  		dprintk("%s: osduld_info_lookup => %d\n", __func__, err); +		if (err == -ENODEV && retry_flag) { +			err = objlayout_autologin(deviceaddr); +			if (likely(!err)) { +				retry_flag = false; +				goto retry_lookup; +			} +		}  		goto out;  	} @@ -205,25 +214,36 @@ static void copy_single_comp(struct ore_components *oc, unsigned c,  int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,  		       struct objio_segment **pseg)  { -	struct __alloc_objio_segment { -		struct objio_segment olseg; -		struct ore_dev *ods[numdevs]; -		struct ore_comp	comps[numdevs]; -	} *aolseg; +/*	This is the in memory structure of the objio_segment + * + *	struct __alloc_objio_segment { + *		struct objio_segment olseg; + *		struct ore_dev *ods[numdevs]; + *		struct ore_comp	comps[numdevs]; + *	} *aolseg; + *	NOTE: The code as above compiles and runs perfectly. It is elegant, + *	type safe and compact. At some Past time Linus has decided he does not + *	like variable length arrays, For the sake of this principal we uglify + *	the code as below. 
+ */ +	struct objio_segment *lseg; +	size_t lseg_size = sizeof(*lseg) + +			numdevs * sizeof(lseg->oc.ods[0]) + +			numdevs * sizeof(*lseg->oc.comps); -	aolseg = kzalloc(sizeof(*aolseg), gfp_flags); -	if (unlikely(!aolseg)) { +	lseg = kzalloc(lseg_size, gfp_flags); +	if (unlikely(!lseg)) {  		dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, -			numdevs, sizeof(*aolseg)); +			numdevs, lseg_size);  		return -ENOMEM;  	} -	aolseg->olseg.oc.numdevs = numdevs; -	aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; -	aolseg->olseg.oc.comps = aolseg->comps; -	aolseg->olseg.oc.ods = aolseg->ods; +	lseg->oc.numdevs = numdevs; +	lseg->oc.single_comp = EC_MULTPLE_COMPS; +	lseg->oc.ods = (void *)(lseg + 1); +	lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); -	*pseg = &aolseg->olseg; +	*pseg = lseg;  	return 0;  } @@ -582,10 +602,10 @@ objlayout_init(void)  	if (ret)  		printk(KERN_INFO -			"%s: Registering OSD pNFS Layout Driver failed: error=%d\n", +			"NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n",  			__func__, ret);  	else -		printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", +		printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n",  			__func__);  	return ret;  } @@ -594,7 +614,7 @@ static void __exit  objlayout_exit(void)  {  	pnfs_unregister_layoutdriver(&objlayout_type); -	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", +	printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n",  	       __func__);  } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index b3c29039f5b..8d45f1c318c 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -37,6 +37,9 @@   *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.   */ +#include <linux/kmod.h> +#include <linux/moduleparam.h> +#include <linux/ratelimit.h>  #include <scsi/osd_initiator.h>  #include "objlayout.h" @@ -156,7 +159,7 @@ last_byte_offset(u64 start, u64 len)  	return end > start ? 
end - 1 : NFS4_MAX_UINT64;  } -void _fix_verify_io_params(struct pnfs_layout_segment *lseg, +static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,  			   struct page ***p_pages, unsigned *p_pgbase,  			   u64 offset, unsigned long count)  { @@ -490,9 +493,9 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)  			if (!ioerr->oer_errno)  				continue; -			printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " -				"dev(%llx:%llx) par=0x%llx obj=0x%llx " -				"offset=0x%llx length=0x%llx\n", +			printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " +				"is_write=%d dev(%llx:%llx) par=0x%llx " +				"obj=0x%llx offset=0x%llx length=0x%llx\n",  				__func__, i, ioerr->oer_errno,  				ioerr->oer_iswrite,  				_DEVID_LO(&ioerr->oer_component.oid_device_id), @@ -651,3 +654,134 @@ void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)  	__free_page(odi->page);  	kfree(odi);  } + +enum { +	OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, +	OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, +	OSD_LOGIN_UPCALL_PATHLEN  = 256 +}; + +static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; + +module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), +		    0600); +MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); + +struct __auto_login { +	char uri[OBJLAYOUT_MAX_URI_LEN]; +	char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; +	char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; +}; + +static int __objlayout_upcall(struct __auto_login *login) +{ +	static char *envp[] = { "HOME=/", +		"TERM=linux", +		"PATH=/sbin:/usr/sbin:/bin:/usr/bin", +		NULL +	}; +	char *argv[8]; +	int ret; + +	if (unlikely(!osd_login_prog[0])) { +		dprintk("%s: osd_login_prog is disabled\n", __func__); +		return -EACCES; +	} + +	dprintk("%s uri: %s\n", __func__, login->uri); +	dprintk("%s osdname %s\n", __func__, login->osdname); +	dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); + +	argv[0] = (char 
*)osd_login_prog; +	argv[1] = "-u"; +	argv[2] = login->uri; +	argv[3] = "-o"; +	argv[4] = login->osdname; +	argv[5] = "-s"; +	argv[6] = login->systemid_hex; +	argv[7] = NULL; + +	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); +	/* +	 * Disable the upcall mechanism if we're getting an ENOENT or +	 * EACCES error. The admin can re-enable it on the fly by using +	 * sysfs to set the objlayoutdriver.osd_login_prog module parameter once +	 * the problem has been fixed. +	 */ +	if (ret == -ENOENT || ret == -EACCES) { +		printk(KERN_ERR "PNFS-OBJ: %s was not found please set " +			"objlayoutdriver.osd_login_prog kernel parameter!\n", +			osd_login_prog); +		osd_login_prog[0] = '\0'; +	} +	dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); + +	return ret; +} + +/* Assume dest is all zeros */ +static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, +					   char *dest, int max_len, +					   const char *var_name) +{ +	if (!s.len) +		return; + +	if (s.len >= max_len) { +		pr_warn_ratelimited( +			"objlayout_autologin: %s: s.len(%d) >= max_len(%d)", +			var_name, s.len, max_len); +		s.len = max_len - 1; /* space for null terminator */ +	} + +	memcpy(dest, s.data, s.len); +} + +/* Assume sysid is all zeros */ +static void _sysid_2_hex(struct nfs4_string s, +		  char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) +{ +	int i; +	char *cur; + +	if (!s.len) +		return; + +	if (s.len != OSD_SYSTEMID_LEN) { +		pr_warn_ratelimited( +		    "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", +		    s.len); +		if (s.len > OSD_SYSTEMID_LEN) +			s.len = OSD_SYSTEMID_LEN; +	} + +	cur = sysid; +	for (i = 0; i < s.len; i++) +		cur = hex_byte_pack(cur, s.data[i]); +} + +int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) +{ +	int rc; +	struct __auto_login login; + +	if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) +		return -ENODEV; + +	memset(&login, 0, sizeof(login)); +	__copy_nfsS_and_zero_terminate( +		
deviceaddr->oda_targetaddr.ota_netaddr.r_addr, +		login.uri, sizeof(login.uri), "URI"); + +	__copy_nfsS_and_zero_terminate( +		deviceaddr->oda_osdname, +		login.osdname, sizeof(login.osdname), "OSDNAME"); + +	_sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); + +	rc = __objlayout_upcall(&login); +	if (rc > 0) /* script returns positive values */ +		rc = -ENODEV; + +	return rc; +} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 8ec34727ed2..880ba086be9 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -184,4 +184,6 @@ extern void objlayout_encode_layoutreturn(  	struct xdr_stream *,  	const struct nfs4_layoutreturn_args *); +extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); +  #endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 5668f7c54c4..d21fceaa9f6 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -13,6 +13,7 @@  #include <linux/file.h>  #include <linux/sched.h>  #include <linux/sunrpc/clnt.h> +#include <linux/nfs.h>  #include <linux/nfs3.h>  #include <linux/nfs4.h>  #include <linux/nfs_page.h> @@ -106,36 +107,6 @@ void nfs_unlock_request(struct nfs_page *req)  	nfs_release_request(req);  } -/** - * nfs_set_page_tag_locked - Tag a request as locked - * @req: - */ -int nfs_set_page_tag_locked(struct nfs_page *req) -{ -	if (!nfs_lock_request_dontget(req)) -		return 0; -	if (test_bit(PG_MAPPED, &req->wb_flags)) -		radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); -	return 1; -} - -/** - * nfs_clear_page_tag_locked - Clear request tag and wake up sleepers - */ -void nfs_clear_page_tag_locked(struct nfs_page *req) -{ -	if (test_bit(PG_MAPPED, &req->wb_flags)) { -		struct inode *inode = req->wb_context->dentry->d_inode; -		struct nfs_inode *nfsi = NFS_I(inode); - -		spin_lock(&inode->i_lock); -		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); -		
nfs_unlock_request(req); -		spin_unlock(&inode->i_lock); -	} else -		nfs_unlock_request(req); -} -  /*   * nfs_clear_request - Free up all resources allocated to the request   * @req: @@ -425,67 +396,6 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)  	}  } -#define NFS_SCAN_MAXENTRIES 16 -/** - * nfs_scan_list - Scan a list for matching requests - * @nfsi: NFS inode - * @dst: Destination list - * @idx_start: lower bound of page->index to scan - * @npages: idx_start + npages sets the upper bound to scan. - * @tag: tag to scan for - * - * Moves elements from one of the inode request lists. - * If the number of requests is set to 0, the entire address_space - * starting at index idx_start, is scanned. - * The requests are *not* checked to ensure that they form a contiguous set. - * You must be holding the inode's i_lock when calling this function - */ -int nfs_scan_list(struct nfs_inode *nfsi, -		struct list_head *dst, pgoff_t idx_start, -		unsigned int npages, int tag) -{ -	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; -	struct nfs_page *req; -	pgoff_t idx_end; -	int found, i; -	int res; -	struct list_head *list; - -	res = 0; -	if (npages == 0) -		idx_end = ~0; -	else -		idx_end = idx_start + npages - 1; - -	for (;;) { -		found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, -				(void **)&pgvec[0], idx_start, -				NFS_SCAN_MAXENTRIES, tag); -		if (found <= 0) -			break; -		for (i = 0; i < found; i++) { -			req = pgvec[i]; -			if (req->wb_index > idx_end) -				goto out; -			idx_start = req->wb_index + 1; -			if (nfs_set_page_tag_locked(req)) { -				kref_get(&req->wb_kref); -				radix_tree_tag_clear(&nfsi->nfs_page_tree, -						req->wb_index, tag); -				list = pnfs_choose_commit_list(req, dst); -				nfs_list_add_request(req, list); -				res++; -				if (res == INT_MAX) -					goto out; -			} -		} -		/* for latency reduction */ -		cond_resched_lock(&nfsi->vfs_inode.i_lock); -	} -out: -	return res; -} -  int __init 
nfs_init_nfspagecache(void)  {  	nfs_page_cachep = kmem_cache_create("nfs_page", diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 17149a49006..b5d45158694 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -101,8 +101,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,  		goto out_no_driver;  	if (!(server->nfs_client->cl_exchange_flags &  		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { -		printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__, -		       id, server->nfs_client->cl_exchange_flags); +		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", +			__func__, id, server->nfs_client->cl_exchange_flags);  		goto out_no_driver;  	}  	ld_type = find_pnfs_driver(id); @@ -122,8 +122,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,  	server->pnfs_curr_ld = ld_type;  	if (ld_type->set_layoutdriver  	    && ld_type->set_layoutdriver(server, mntfh)) { -		printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", -				__func__, id); +		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " +			"driver %u.\n", __func__, id);  		module_put(ld_type->owner);  		goto out_no_driver;  	} @@ -143,11 +143,11 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)  	struct pnfs_layoutdriver_type *tmp;  	if (ld_type->id == 0) { -		printk(KERN_ERR "%s id 0 is reserved\n", __func__); +		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);  		return status;  	}  	if (!ld_type->alloc_lseg || !ld_type->free_lseg) { -		printk(KERN_ERR "%s Layout driver must provide " +		printk(KERN_ERR "NFS: %s Layout driver must provide "  		       "alloc_lseg and free_lseg.\n", __func__);  		return status;  	} @@ -160,7 +160,7 @@ pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)  		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,  			ld_type->name);  	} else { -		printk(KERN_ERR "%s Module with id %d already loaded!\n", +		printk(KERN_ERR "NFS: %s 
Module with id %d already loaded!\n",  			__func__, ld_type->id);  	}  	spin_unlock(&pnfs_spinlock); @@ -496,12 +496,12 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,  {  	u32 oldseq, newseq; -	oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid); -	newseq = be32_to_cpu(new->stateid.seqid); +	oldseq = be32_to_cpu(lo->plh_stateid.seqid); +	newseq = be32_to_cpu(new->seqid);  	if ((int)(newseq - oldseq) > 0) { -		memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid)); +		nfs4_stateid_copy(&lo->plh_stateid, new);  		if (update_barrier) { -			u32 new_barrier = be32_to_cpu(new->stateid.seqid); +			u32 new_barrier = be32_to_cpu(new->seqid);  			if ((int)(new_barrier - lo->plh_barrier))  				lo->plh_barrier = new_barrier; @@ -525,7 +525,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,  			int lget)  {  	if ((stateid) && -	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) +	    (int)(lo->plh_barrier - be32_to_cpu(stateid->seqid)) >= 0)  		return true;  	return lo->plh_block_lgets ||  		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || @@ -549,11 +549,10 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,  		do {  			seq = read_seqbegin(&open_state->seqlock); -			memcpy(dst->data, open_state->stateid.data, -			       sizeof(open_state->stateid.data)); +			nfs4_stateid_copy(dst, &open_state->stateid);  		} while (read_seqretry(&open_state->seqlock, seq));  	} else -		memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data)); +		nfs4_stateid_copy(dst, &lo->plh_stateid);  	spin_unlock(&lo->plh_inode->i_lock);  	dprintk("<-- %s\n", __func__);  	return status; @@ -590,7 +589,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,  	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;  	max_pages = max_resp_sz >> PAGE_SHIFT; -	pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); +	pages = kcalloc(max_pages, sizeof(struct page *), 
gfp_flags);  	if (!pages)  		goto out_err_free; @@ -760,7 +759,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)  		}  	if (!found) {  		struct pnfs_layout_hdr *lo = nfsi->layout; -		u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid); +		u32 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);  		/* Since close does not return a layout stateid for use as  		 * a barrier, we choose the worst-case barrier. @@ -966,8 +965,7 @@ pnfs_update_layout(struct inode *ino,  	}  	/* Do we even need to bother with this? */ -	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || -	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { +	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {  		dprintk("%s matches recall, use MDS\n", __func__);  		goto out_unlock;  	} @@ -1032,7 +1030,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)  	struct nfs4_layoutget_res *res = &lgp->res;  	struct pnfs_layout_segment *lseg;  	struct inode *ino = lo->plh_inode; -	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;  	int status = 0;  	/* Inject layout blob into I/O device driver */ @@ -1048,8 +1045,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)  	}  	spin_lock(&ino->i_lock); -	if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || -	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { +	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {  		dprintk("%s forget reply due to recall\n", __func__);  		goto out_forget_reply;  	} @@ -1214,6 +1210,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data)  		}  		data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);  	} +	put_lseg(data->lseg);  	data->mds_ops->rpc_release(data);  }  EXPORT_SYMBOL_GPL(pnfs_ld_write_done); @@ -1227,6 +1224,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,  		nfs_list_add_request(data->req, &desc->pg_list);  	nfs_pageio_reset_write_mds(desc);  	desc->pg_recoalesce = 1; +	put_lseg(data->lseg);  	nfs_writedata_release(data);  } @@ -1327,6 
+1325,7 @@ void pnfs_ld_read_done(struct nfs_read_data *data)  		data->mds_ops->rpc_call_done(&data->task, data);  	} else  		pnfs_ld_handle_read_error(data); +	put_lseg(data->lseg);  	data->mds_ops->rpc_release(data);  }  EXPORT_SYMBOL_GPL(pnfs_ld_read_done); @@ -1530,8 +1529,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)  	end_pos = nfsi->layout->plh_lwb;  	nfsi->layout->plh_lwb = 0; -	memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, -		sizeof(nfsi->layout->plh_stateid.data)); +	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);  	spin_unlock(&inode->i_lock);  	data->args.inode = inode; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 53d593a0a4f..442ebf68eee 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -94,11 +94,10 @@ struct pnfs_layoutdriver_type {  	const struct nfs_pageio_ops *pg_read_ops;  	const struct nfs_pageio_ops *pg_write_ops; -	/* Returns true if layoutdriver wants to divert this request to -	 * driver's commit routine. -	 */ -	bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); -	struct list_head * (*choose_commit_list) (struct nfs_page *req); +	void (*mark_request_commit) (struct nfs_page *req, +					struct pnfs_layout_segment *lseg); +	void (*clear_request_commit) (struct nfs_page *req); +	int (*scan_commit_lists) (struct inode *inode, int max, spinlock_t *lock);  	int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);  	/* @@ -229,7 +228,6 @@ struct nfs4_deviceid_node {  	atomic_t			ref;  }; -void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);  struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);  void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);  void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, @@ -262,20 +260,6 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss)  	
return nfss->pnfs_curr_ld != NULL;  } -static inline void -pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) -{ -	if (lseg) { -		struct pnfs_layoutdriver_type *ld; - -		ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; -		if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) { -			set_bit(PG_PNFS_COMMIT, &req->wb_flags); -			req->wb_commit_lseg = get_lseg(lseg); -		} -	} -} -  static inline int  pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)  { @@ -284,27 +268,42 @@ pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)  	return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);  } -static inline struct list_head * -pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)  { -	struct list_head *rv; +	struct inode *inode = req->wb_context->dentry->d_inode; +	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; -	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) { -		struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode; +	if (lseg == NULL || ld->mark_request_commit == NULL) +		return false; +	ld->mark_request_commit(req, lseg); +	return true; +} -		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); -		rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); -		/* matched by ref taken when PG_PNFS_COMMIT is set */ -		put_lseg(req->wb_commit_lseg); -	} else -		rv = mds; -	return rv; +static inline bool +pnfs_clear_request_commit(struct nfs_page *req) +{ +	struct inode *inode = req->wb_context->dentry->d_inode; +	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + +	if (ld == NULL || ld->clear_request_commit == NULL) +		return false; +	ld->clear_request_commit(req); +	return true;  } -static inline void pnfs_clear_request_commit(struct nfs_page *req) +static inline int 
+pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)  { -	if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) -		put_lseg(req->wb_commit_lseg); +	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; +	int ret; + +	if (ld == NULL || ld->scan_commit_lists == NULL) +		return 0; +	ret = ld->scan_commit_lists(inode, max, lock); +	if (ret != 0) +		set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); +	return ret;  }  /* Should the pNFS client commit and return the layout upon a setattr */ @@ -328,6 +327,13 @@ static inline int pnfs_return_layout(struct inode *ino)  	return 0;  } +#ifdef NFS_DEBUG +void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); +#else +static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) +{ +} +#endif /* NFS_DEBUG */  #else  /* CONFIG_NFS_V4_1 */  static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -400,35 +406,35 @@ static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, st  	return false;  } -static inline void -pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) -{ -} -  static inline int  pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)  {  	return PNFS_NOT_ATTEMPTED;  } -static inline struct list_head * -pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +static inline bool +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)  { -	return mds; +	return false;  } -static inline void pnfs_clear_request_commit(struct nfs_page *req) +static inline bool +pnfs_clear_request_commit(struct nfs_page *req)  { +	return false;  } -static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +static inline int +pnfs_scan_commit_lists(struct inode *inode, int max, spinlock_t *lock)  {  	return 0;  } -static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl) +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool 
sync)  { +	return 0;  } +  #endif /* CONFIG_NFS_V4_1 */  #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 4f359d2a26e..73f701f1f4d 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -43,6 +43,7 @@  static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];  static DEFINE_SPINLOCK(nfs4_deviceid_lock); +#ifdef NFS_DEBUG  void  nfs4_print_deviceid(const struct nfs4_deviceid *id)  { @@ -52,6 +53,7 @@ nfs4_print_deviceid(const struct nfs4_deviceid *id)  		p[0], p[1], p[2], p[3]);  }  EXPORT_SYMBOL_GPL(nfs4_print_deviceid); +#endif  static inline u32  nfs4_deviceid_hash(const struct nfs4_deviceid *id) @@ -92,7 +94,7 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,   * @clp nfs_client associated with deviceid   * @id deviceid to look up   */ -struct nfs4_deviceid_node * +static struct nfs4_deviceid_node *  _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,  		   const struct nfs_client *clp, const struct nfs4_deviceid *id,  		   long hash) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 0c672588fe5..b63b6f4d14f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -358,6 +358,11 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE];  } +static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) +{ +	rpc_call_start(task); +} +  static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)  {  	if (nfs_async_handle_expired_key(task)) @@ -372,6 +377,11 @@ nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir)  	msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME];  } +static void nfs_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) +{ +	rpc_call_start(task); +} +  static int  nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,  		     struct inode *new_dir) @@ -651,6 +661,11 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct 
rpc_message *  	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];  } +static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +{ +	rpc_call_start(task); +} +  static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)  {  	if (nfs_async_handle_expired_key(task)) @@ -668,6 +683,11 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message  	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];  } +static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) +{ +	rpc_call_start(task); +} +  static void  nfs_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)  { @@ -721,9 +741,11 @@ const struct nfs_rpc_ops nfs_v2_clientops = {  	.create		= nfs_proc_create,  	.remove		= nfs_proc_remove,  	.unlink_setup	= nfs_proc_unlink_setup, +	.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,  	.unlink_done	= nfs_proc_unlink_done,  	.rename		= nfs_proc_rename,  	.rename_setup	= nfs_proc_rename_setup, +	.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,  	.rename_done	= nfs_proc_rename_done,  	.link		= nfs_proc_link,  	.symlink	= nfs_proc_symlink, @@ -736,8 +758,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {  	.pathconf	= nfs_proc_pathconf,  	.decode_dirent	= nfs2_decode_dirent,  	.read_setup	= nfs_proc_read_setup, +	.read_rpc_prepare = nfs_proc_read_rpc_prepare,  	.read_done	= nfs_read_done,  	.write_setup	= nfs_proc_write_setup, +	.write_rpc_prepare = nfs_proc_write_rpc_prepare,  	.write_done	= nfs_write_done,  	.commit_setup	= nfs_proc_commit_setup,  	.lock		= nfs_proc_lock, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index cfa175c223d..cc1f758a7ee 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -66,7 +66,6 @@ void nfs_readdata_free(struct nfs_read_data *p)  void nfs_readdata_release(struct nfs_read_data *rdata)  { -	put_lseg(rdata->lseg);  	put_nfs_open_context(rdata->args.context);  	nfs_readdata_free(rdata);  } @@ -465,23 +464,14 @@ static void 
nfs_readpage_release_partial(void *calldata)  	nfs_readdata_release(calldata);  } -#if defined(CONFIG_NFS_V4_1)  void nfs_read_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_read_data *data = calldata; - -	if (nfs4_setup_sequence(NFS_SERVER(data->inode), -				&data->args.seq_args, &data->res.seq_res, -				0, task)) -		return; -	rpc_call_start(task); +	NFS_PROTO(data->inode)->read_rpc_prepare(task, data);  } -#endif /* CONFIG_NFS_V4_1 */  static const struct rpc_call_ops nfs_read_partial_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_readpage_result_partial,  	.rpc_release = nfs_readpage_release_partial,  }; @@ -545,9 +535,7 @@ static void nfs_readpage_release_full(void *calldata)  }  static const struct rpc_call_ops nfs_read_full_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_read_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_readpage_result_full,  	.rpc_release = nfs_readpage_release_full,  }; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3dfa4f112c0..ccc4cdb1efe 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -52,6 +52,8 @@  #include <linux/nfs_xdr.h>  #include <linux/magic.h>  #include <linux/parser.h> +#include <linux/nsproxy.h> +#include <linux/rcupdate.h>  #include <asm/system.h>  #include <asm/uaccess.h> @@ -79,7 +81,6 @@ enum {  	Opt_cto, Opt_nocto,  	Opt_ac, Opt_noac,  	Opt_lock, Opt_nolock, -	Opt_v2, Opt_v3, Opt_v4,  	Opt_udp, Opt_tcp, Opt_rdma,  	Opt_acl, Opt_noacl,  	Opt_rdirplus, Opt_nordirplus, @@ -97,10 +98,10 @@ enum {  	Opt_namelen,  	Opt_mountport,  	Opt_mountvers, -	Opt_nfsvers,  	Opt_minorversion,  	/* Mount options that take string arguments */ +	Opt_nfsvers,  	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,  	Opt_addr, Opt_mountaddr, Opt_clientaddr,  	Opt_lookupcache, @@ -132,9 +133,6 @@ static const match_table_t nfs_mount_option_tokens = {  	{ Opt_noac, "noac" },  	{ Opt_lock, "lock" },  	{ 
Opt_nolock, "nolock" }, -	{ Opt_v2, "v2" }, -	{ Opt_v3, "v3" }, -	{ Opt_v4, "v4" },  	{ Opt_udp, "udp" },  	{ Opt_tcp, "tcp" },  	{ Opt_rdma, "rdma" }, @@ -163,9 +161,10 @@ static const match_table_t nfs_mount_option_tokens = {  	{ Opt_namelen, "namlen=%s" },  	{ Opt_mountport, "mountport=%s" },  	{ Opt_mountvers, "mountvers=%s" }, +	{ Opt_minorversion, "minorversion=%s" }, +  	{ Opt_nfsvers, "nfsvers=%s" },  	{ Opt_nfsvers, "vers=%s" }, -	{ Opt_minorversion, "minorversion=%s" },  	{ Opt_sec, "sec=%s" },  	{ Opt_proto, "proto=%s" }, @@ -179,6 +178,9 @@ static const match_table_t nfs_mount_option_tokens = {  	{ Opt_fscache_uniq, "fsc=%s" },  	{ Opt_local_lock, "local_lock=%s" }, +	/* The following needs to be listed after all other options */ +	{ Opt_nfsvers, "v%s" }, +  	{ Opt_err, NULL }  }; @@ -259,6 +261,22 @@ static match_table_t nfs_local_lock_tokens = {  	{ Opt_local_lock_err, NULL }  }; +enum { +	Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, +	Opt_vers_4_1, + +	Opt_vers_err +}; + +static match_table_t nfs_vers_tokens = { +	{ Opt_vers_2, "2" }, +	{ Opt_vers_3, "3" }, +	{ Opt_vers_4, "4" }, +	{ Opt_vers_4_0, "4.0" }, +	{ Opt_vers_4_1, "4.1" }, + +	{ Opt_vers_err, NULL } +};  static void nfs_umount_begin(struct super_block *);  static int  nfs_statfs(struct dentry *, struct kstatfs *); @@ -620,7 +638,6 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,  	struct nfs_client *clp = nfss->nfs_client;  	seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); -	seq_printf(m, ",minorversion=%u", clp->cl_minorversion);  }  #else  static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss, @@ -629,6 +646,15 @@ static void nfs_show_nfsv4_options(struct seq_file *m, struct nfs_server *nfss,  }  #endif +static void nfs_show_nfs_version(struct seq_file *m, +		unsigned int version, +		unsigned int minorversion) +{ +	seq_printf(m, ",vers=%u", version); +	if (version == 4) +		seq_printf(m, ".%u", minorversion); +} +  /*   * 
Describe the mount options in force on this server representation   */ @@ -656,7 +682,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,  	u32 version = clp->rpc_ops->version;  	int local_flock, local_fcntl; -	seq_printf(m, ",vers=%u", version); +	nfs_show_nfs_version(m, version, clp->cl_minorversion);  	seq_printf(m, ",rsize=%u", nfss->rsize);  	seq_printf(m, ",wsize=%u", nfss->wsize);  	if (nfss->bsize != 0) @@ -676,8 +702,10 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,  		else  			seq_puts(m, nfs_infop->nostr);  	} +	rcu_read_lock();  	seq_printf(m, ",proto=%s",  		   rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); +	rcu_read_unlock();  	if (version == 4) {  		if (nfss->port != NFS_PORT)  			seq_printf(m, ",port=%u", nfss->port); @@ -726,9 +754,11 @@ static int nfs_show_options(struct seq_file *m, struct dentry *root)  	nfs_show_mount_options(m, nfss, 0); +	rcu_read_lock();  	seq_printf(m, ",addr=%s",  			rpc_peeraddr2str(nfss->nfs_client->cl_rpcclient,  							RPC_DISPLAY_ADDR)); +	rcu_read_unlock();  	return 0;  } @@ -745,7 +775,6 @@ static void show_sessions(struct seq_file *m, struct nfs_server *server) {}  #endif  #endif -#ifdef CONFIG_NFS_V4  #ifdef CONFIG_NFS_V4_1  static void show_pnfs(struct seq_file *m, struct nfs_server *server)  { @@ -755,9 +784,26 @@ static void show_pnfs(struct seq_file *m, struct nfs_server *server)  	else  		seq_printf(m, "not configured");  } + +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ +	if (nfss->nfs_client && nfss->nfs_client->impl_id) { +		struct nfs41_impl_id *impl_id = nfss->nfs_client->impl_id; +		seq_printf(m, "\n\timpl_id:\tname='%s',domain='%s'," +			   "date='%llu,%u'", +			   impl_id->name, impl_id->domain, +			   impl_id->date.seconds, impl_id->date.nseconds); +	} +}  #else -static void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#ifdef CONFIG_NFS_V4 +static void show_pnfs(struct 
seq_file *m, struct nfs_server *server) +{ +}  #endif +static void show_implementation_id(struct seq_file *m, struct nfs_server *nfss) +{ +}  #endif  static int nfs_show_devname(struct seq_file *m, struct dentry *root) @@ -806,6 +852,8 @@ static int nfs_show_stats(struct seq_file *m, struct dentry *root)  	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); +	show_implementation_id(m, nfss); +  	seq_printf(m, "\n\tcaps:\t");  	seq_printf(m, "caps=0x%x", nfss->caps);  	seq_printf(m, ",wtmult=%u", nfss->wtmult); @@ -908,6 +956,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve  		data->auth_flavor_len	= 1;  		data->version		= version;  		data->minorversion	= 0; +		data->net		= current->nsproxy->net_ns;  		security_init_mnt_opts(&data->lsm_opts);  	}  	return data; @@ -1052,6 +1101,40 @@ static int nfs_parse_security_flavors(char *value,  	return 1;  } +static int nfs_parse_version_string(char *string, +		struct nfs_parsed_mount_data *mnt, +		substring_t *args) +{ +	mnt->flags &= ~NFS_MOUNT_VER3; +	switch (match_token(string, nfs_vers_tokens, args)) { +	case Opt_vers_2: +		mnt->version = 2; +		break; +	case Opt_vers_3: +		mnt->flags |= NFS_MOUNT_VER3; +		mnt->version = 3; +		break; +	case Opt_vers_4: +		/* Backward compatibility option. In future, +		 * the mount program should always supply +		 * a NFSv4 minor version number. 
+		 */ +		mnt->version = 4; +		break; +	case Opt_vers_4_0: +		mnt->version = 4; +		mnt->minorversion = 0; +		break; +	case Opt_vers_4_1: +		mnt->version = 4; +		mnt->minorversion = 1; +		break; +	default: +		return 0; +	} +	return 1; +} +  static int nfs_get_option_str(substring_t args[], char **option)  {  	kfree(*option); @@ -1157,18 +1240,6 @@ static int nfs_parse_mount_options(char *raw,  			mnt->flags |= (NFS_MOUNT_LOCAL_FLOCK |  				       NFS_MOUNT_LOCAL_FCNTL);  			break; -		case Opt_v2: -			mnt->flags &= ~NFS_MOUNT_VER3; -			mnt->version = 2; -			break; -		case Opt_v3: -			mnt->flags |= NFS_MOUNT_VER3; -			mnt->version = 3; -			break; -		case Opt_v4: -			mnt->flags &= ~NFS_MOUNT_VER3; -			mnt->version = 4; -			break;  		case Opt_udp:  			mnt->flags &= ~NFS_MOUNT_TCP;  			mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1295,26 +1366,6 @@ static int nfs_parse_mount_options(char *raw,  				goto out_invalid_value;  			mnt->mount_server.version = option;  			break; -		case Opt_nfsvers: -			if (nfs_get_option_ul(args, &option)) -				goto out_invalid_value; -			switch (option) { -			case NFS2_VERSION: -				mnt->flags &= ~NFS_MOUNT_VER3; -				mnt->version = 2; -				break; -			case NFS3_VERSION: -				mnt->flags |= NFS_MOUNT_VER3; -				mnt->version = 3; -				break; -			case NFS4_VERSION: -				mnt->flags &= ~NFS_MOUNT_VER3; -				mnt->version = 4; -				break; -			default: -				goto out_invalid_value; -			} -			break;  		case Opt_minorversion:  			if (nfs_get_option_ul(args, &option))  				goto out_invalid_value; @@ -1326,6 +1377,15 @@ static int nfs_parse_mount_options(char *raw,  		/*  		 * options that take text values  		 */ +		case Opt_nfsvers: +			string = match_strdup(args); +			if (string == NULL) +				goto out_nomem; +			rc = nfs_parse_version_string(string, mnt, args); +			kfree(string); +			if (!rc) +				goto out_invalid_value; +			break;  		case Opt_sec:  			string = match_strdup(args);  			if (string == NULL) @@ -1405,7 +1465,7 @@ static int 
nfs_parse_mount_options(char *raw,  			if (string == NULL)  				goto out_nomem;  			mnt->nfs_server.addrlen = -				rpc_pton(string, strlen(string), +				rpc_pton(mnt->net, string, strlen(string),  					(struct sockaddr *)  					&mnt->nfs_server.address,  					sizeof(mnt->nfs_server.address)); @@ -1427,7 +1487,7 @@ static int nfs_parse_mount_options(char *raw,  			if (string == NULL)  				goto out_nomem;  			mnt->mount_server.addrlen = -				rpc_pton(string, strlen(string), +				rpc_pton(mnt->net, string, strlen(string),  					(struct sockaddr *)  					&mnt->mount_server.address,  					sizeof(mnt->mount_server.address)); @@ -1516,6 +1576,9 @@ static int nfs_parse_mount_options(char *raw,  	if (!sloppy && invalid_option)  		return 0; +	if (mnt->minorversion && mnt->version != 4) +		goto out_minorversion_mismatch; +  	/*  	 * verify that any proto=/mountproto= options match the address  	 * familiies in the addr=/mountaddr= options. @@ -1549,6 +1612,10 @@ out_invalid_address:  out_invalid_value:  	printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);  	return 0; +out_minorversion_mismatch: +	printk(KERN_INFO "NFS: mount option vers=%u does not support " +			 "minorversion=%u\n", mnt->version, mnt->minorversion); +	return 0;  out_nomem:  	printk(KERN_INFO "NFS: not enough memory to parse option\n");  	return 0; @@ -1622,6 +1689,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,  		.noresvport	= args->flags & NFS_MOUNT_NORESVPORT,  		.auth_flav_len	= &server_authlist_len,  		.auth_flavs	= server_authlist, +		.net		= args->net,  	};  	int status; @@ -2047,7 +2115,7 @@ static inline void nfs_initialise_sb(struct super_block *sb)  	/* We probably want something more informative here */  	snprintf(sb->s_id, sizeof(sb->s_id), -		 "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); +		 "%u:%u", MAJOR(sb->s_dev), MINOR(sb->s_dev));  	if (sb->s_blocksize == 0)  		sb->s_blocksize = nfs_block_bits(server->wsize, @@ -2499,12 +2567,6 @@ static int 
nfs4_validate_text_mount_data(void *options,  		return -EINVAL;  	} -	if (args->client_address == NULL) { -		dfprintk(MOUNT, -			 "NFS4: mount program didn't pass callback address\n"); -		return -EINVAL; -	} -  	return nfs_parse_devname(dev_name,  				   &args->nfs_server.hostname,  				   NFS4_MAXNAMLEN, @@ -2663,8 +2725,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,  	if (!s->s_root) {  		/* initial superblock/root creation */  		nfs4_fill_super(s); -		nfs_fscache_get_super_cookie( -			s, data ? data->fscache_uniq : NULL, NULL); +		nfs_fscache_get_super_cookie(s, data->fscache_uniq, NULL);  	}  	mntroot = nfs4_get_root(s, mntfh, dev_name); diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 978aaeb8a09..ad4d2e787b2 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -32,7 +32,6 @@ static ctl_table nfs_cb_sysctls[] = {  		.extra1 = (int *)&nfs_set_port_min,  		.extra2 = (int *)&nfs_set_port_max,  	}, -#ifndef CONFIG_NFS_USE_NEW_IDMAPPER  	{  		.procname = "idmap_cache_timeout",  		.data = &nfs_idmap_cache_timeout, @@ -40,7 +39,6 @@ static ctl_table nfs_cb_sysctls[] = {  		.mode = 0644,  		.proc_handler = proc_dointvec_jiffies,  	}, -#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */  #endif  	{  		.procname	= "nfs_mountpoint_timeout", diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 4f9319a2e56..3210a03342f 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -20,15 +20,6 @@  #include "iostat.h"  #include "delegation.h" -struct nfs_unlinkdata { -	struct hlist_node list; -	struct nfs_removeargs args; -	struct nfs_removeres res; -	struct inode *dir; -	struct rpc_cred	*cred; -	struct nfs_fattr dir_attr; -}; -  /**   * nfs_free_unlinkdata - release data from a sillydelete operation.   * @data: pointer to unlink structure. 
@@ -107,25 +98,16 @@ static void nfs_async_unlink_release(void *calldata)  	nfs_sb_deactive(sb);  } -#if defined(CONFIG_NFS_V4_1) -void nfs_unlink_prepare(struct rpc_task *task, void *calldata) +static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_unlinkdata *data = calldata; -	struct nfs_server *server = NFS_SERVER(data->dir); - -	if (nfs4_setup_sequence(server, &data->args.seq_args, -				&data->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +	NFS_PROTO(data->dir)->unlink_rpc_prepare(task, data);  } -#endif /* CONFIG_NFS_V4_1 */  static const struct rpc_call_ops nfs_unlink_ops = {  	.rpc_call_done = nfs_async_unlink_done,  	.rpc_release = nfs_async_unlink_release, -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_unlink_prepare, -#endif /* CONFIG_NFS_V4_1 */  };  static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct nfs_unlinkdata *data) @@ -341,18 +323,6 @@ nfs_cancel_async_unlink(struct dentry *dentry)  	spin_unlock(&dentry->d_lock);  } -struct nfs_renamedata { -	struct nfs_renameargs	args; -	struct nfs_renameres	res; -	struct rpc_cred		*cred; -	struct inode		*old_dir; -	struct dentry		*old_dentry; -	struct nfs_fattr	old_fattr; -	struct inode		*new_dir; -	struct dentry		*new_dentry; -	struct nfs_fattr	new_fattr; -}; -  /**   * nfs_async_rename_done - Sillyrename post-processing   * @task: rpc_task of the sillyrename @@ -403,25 +373,16 @@ static void nfs_async_rename_release(void *calldata)  	kfree(data);  } -#if defined(CONFIG_NFS_V4_1)  static void nfs_rename_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_renamedata *data = calldata; -	struct nfs_server *server = NFS_SERVER(data->old_dir); - -	if (nfs4_setup_sequence(server, &data->args.seq_args, -				&data->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +	NFS_PROTO(data->old_dir)->rename_rpc_prepare(task, data);  } -#endif /* CONFIG_NFS_V4_1 */  static const struct rpc_call_ops nfs_rename_ops = {  	
.rpc_call_done = nfs_async_rename_done,  	.rpc_release = nfs_async_rename_release, -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_rename_prepare, -#endif /* CONFIG_NFS_V4_1 */  };  /** diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 834f0fe96f8..2c68818f68a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -100,7 +100,6 @@ void nfs_writedata_free(struct nfs_write_data *p)  void nfs_writedata_release(struct nfs_write_data *wdata)  { -	put_lseg(wdata->lseg);  	put_nfs_open_context(wdata->args.context);  	nfs_writedata_free(wdata);  } @@ -236,10 +235,10 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo  		req = nfs_page_find_request_locked(page);  		if (req == NULL)  			break; -		if (nfs_set_page_tag_locked(req)) +		if (nfs_lock_request_dontget(req))  			break;  		/* Note: If we hold the page lock, as is the case in nfs_writepage, -		 *	 then the call to nfs_set_page_tag_locked() will always +		 *	 then the call to nfs_lock_request_dontget() will always  		 *	 succeed provided that someone hasn't already marked the  		 *	 request as dirty (in which case we don't care).  		 */ @@ -375,21 +374,14 @@ out_err:  /*   * Insert a write request into an inode   */ -static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)  {  	struct nfs_inode *nfsi = NFS_I(inode); -	int error; - -	error = radix_tree_preload(GFP_NOFS); -	if (error != 0) -		goto out;  	/* Lock the request! 
*/  	nfs_lock_request_dontget(req);  	spin_lock(&inode->i_lock); -	error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); -	BUG_ON(error);  	if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))  		inode->i_version++;  	set_bit(PG_MAPPED, &req->wb_flags); @@ -397,12 +389,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)  	set_page_private(req->wb_page, (unsigned long)req);  	nfsi->npages++;  	kref_get(&req->wb_kref); -	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, -				NFS_PAGE_TAG_LOCKED);  	spin_unlock(&inode->i_lock); -	radix_tree_preload_end(); -out: -	return error;  }  /* @@ -419,7 +406,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)  	set_page_private(req->wb_page, 0);  	ClearPagePrivate(req->wb_page);  	clear_bit(PG_MAPPED, &req->wb_flags); -	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);  	nfsi->npages--;  	spin_unlock(&inode->i_lock);  	nfs_release_request(req); @@ -432,39 +418,90 @@ nfs_mark_request_dirty(struct nfs_page *req)  }  #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -/* - * Add a request to the inode's commit list. +/** + * nfs_request_add_commit_list - add request to a commit list + * @req: pointer to a struct nfs_page + * @head: commit list head + * + * This sets the PG_CLEAN bit, updates the inode global count of + * number of outstanding requests requiring a commit as well as + * the MM page stats. + * + * The caller must _not_ hold the inode->i_lock, but must be + * holding the nfs_page lock.   
*/ -static void -nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +void +nfs_request_add_commit_list(struct nfs_page *req, struct list_head *head)  {  	struct inode *inode = req->wb_context->dentry->d_inode; -	struct nfs_inode *nfsi = NFS_I(inode); -	spin_lock(&inode->i_lock);  	set_bit(PG_CLEAN, &(req)->wb_flags); -	radix_tree_tag_set(&nfsi->nfs_page_tree, -			req->wb_index, -			NFS_PAGE_TAG_COMMIT); -	nfsi->ncommit++; +	spin_lock(&inode->i_lock); +	nfs_list_add_request(req, head); +	NFS_I(inode)->ncommit++;  	spin_unlock(&inode->i_lock); -	pnfs_mark_request_commit(req, lseg);  	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);  	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);  	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);  } +EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); -static int +/** + * nfs_request_remove_commit_list - Remove request from a commit list + * @req: pointer to a nfs_page + * + * This clears the PG_CLEAN bit, and updates the inode global count of + * number of outstanding requests requiring a commit + * It does not update the MM page stats. + * + * The caller _must_ hold the inode->i_lock and the nfs_page lock. + */ +void +nfs_request_remove_commit_list(struct nfs_page *req) +{ +	struct inode *inode = req->wb_context->dentry->d_inode; + +	if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) +		return; +	nfs_list_remove_request(req); +	NFS_I(inode)->ncommit--; +} +EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); + + +/* + * Add a request to the inode's commit list. 
+ */ +static void +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +	struct inode *inode = req->wb_context->dentry->d_inode; + +	if (pnfs_mark_request_commit(req, lseg)) +		return; +	nfs_request_add_commit_list(req, &NFS_I(inode)->commit_list); +} + +static void +nfs_clear_page_commit(struct page *page) +{ +	dec_zone_page_state(page, NR_UNSTABLE_NFS); +	dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); +} + +static void  nfs_clear_request_commit(struct nfs_page *req)  { -	struct page *page = req->wb_page; +	if (test_bit(PG_CLEAN, &req->wb_flags)) { +		struct inode *inode = req->wb_context->dentry->d_inode; -	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { -		dec_zone_page_state(page, NR_UNSTABLE_NFS); -		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); -		return 1; +		if (!pnfs_clear_request_commit(req)) { +			spin_lock(&inode->i_lock); +			nfs_request_remove_commit_list(req); +			spin_unlock(&inode->i_lock); +		} +		nfs_clear_page_commit(req->wb_page);  	} -	return 0;  }  static inline @@ -491,15 +528,14 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,  	return 0;  }  #else -static inline void +static void  nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)  {  } -static inline int +static void  nfs_clear_request_commit(struct nfs_page *req)  { -	return 0;  }  static inline @@ -520,46 +556,65 @@ int nfs_reschedule_unstable_write(struct nfs_page *req,  static int  nfs_need_commit(struct nfs_inode *nfsi)  { -	return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT); +	return nfsi->ncommit > 0; +} + +/* i_lock held by caller */ +static int +nfs_scan_commit_list(struct list_head *src, struct list_head *dst, int max, +		spinlock_t *lock) +{ +	struct nfs_page *req, *tmp; +	int ret = 0; + +	list_for_each_entry_safe(req, tmp, src, wb_list) { +		if (!nfs_lock_request(req)) +			continue; +		if (cond_resched_lock(lock)) +			list_safe_reset_next(req, tmp, 
wb_list); +		nfs_request_remove_commit_list(req); +		nfs_list_add_request(req, dst); +		ret++; +		if (ret == max) +			break; +	} +	return ret;  }  /*   * nfs_scan_commit - Scan an inode for commit requests   * @inode: NFS inode to scan   * @dst: destination list - * @idx_start: lower bound of page->index to scan. - * @npages: idx_start + npages sets the upper bound to scan.   *   * Moves requests from the inode's 'commit' request list.   * The requests are *not* checked to ensure that they form a contiguous set.   */  static int -nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +nfs_scan_commit(struct inode *inode, struct list_head *dst)  {  	struct nfs_inode *nfsi = NFS_I(inode); -	int ret; - -	if (!nfs_need_commit(nfsi)) -		return 0; +	int ret = 0;  	spin_lock(&inode->i_lock); -	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); -	if (ret > 0) -		nfsi->ncommit -= ret; -	spin_unlock(&inode->i_lock); - -	if (nfs_need_commit(NFS_I(inode))) -		__mark_inode_dirty(inode, I_DIRTY_DATASYNC); +	if (nfsi->ncommit > 0) { +		const int max = INT_MAX; +		ret = nfs_scan_commit_list(&nfsi->commit_list, dst, max, +				&inode->i_lock); +		ret += pnfs_scan_commit_lists(inode, max - ret, +				&inode->i_lock); +	} +	spin_unlock(&inode->i_lock);  	return ret;  } +  #else  static inline int nfs_need_commit(struct nfs_inode *nfsi)  {  	return 0;  } -static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst)  {  	return 0;  } @@ -604,7 +659,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,  		    || end < req->wb_offset)  			goto out_flushme; -		if (nfs_set_page_tag_locked(req)) +		if (nfs_lock_request_dontget(req))  			break;  		/* The request is locked, so wait and then retry */ @@ -616,13 +671,6 @@ static struct nfs_page 
*nfs_try_to_update_request(struct inode *inode,  		spin_lock(&inode->i_lock);  	} -	if (nfs_clear_request_commit(req) && -	    radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, -				 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { -		NFS_I(inode)->ncommit--; -		pnfs_clear_request_commit(req); -	} -  	/* Okay, the request matches. Update the region */  	if (offset < req->wb_offset) {  		req->wb_offset = offset; @@ -634,6 +682,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,  		req->wb_bytes = rqend - req->wb_offset;  out_unlock:  	spin_unlock(&inode->i_lock); +	nfs_clear_request_commit(req);  	return req;  out_flushme:  	spin_unlock(&inode->i_lock); @@ -655,7 +704,6 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,  {  	struct inode *inode = page->mapping->host;  	struct nfs_page	*req; -	int error;  	req = nfs_try_to_update_request(inode, page, offset, bytes);  	if (req != NULL) @@ -663,11 +711,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,  	req = nfs_create_request(ctx, inode, page, offset, bytes);  	if (IS_ERR(req))  		goto out; -	error = nfs_inode_add_request(inode, req); -	if (error != 0) { -		nfs_release_request(req); -		req = ERR_PTR(error); -	} +	nfs_inode_add_request(inode, req);  out:  	return req;  } @@ -684,7 +728,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,  	nfs_grow_file(page, offset, count);  	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);  	nfs_mark_request_dirty(req); -	nfs_clear_page_tag_locked(req); +	nfs_unlock_request(req);  	return 0;  } @@ -777,7 +821,7 @@ static void nfs_writepage_release(struct nfs_page *req,  	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))  		nfs_inode_remove_request(req); -	nfs_clear_page_tag_locked(req); +	nfs_unlock_request(req);  	nfs_end_page_writeback(page);  } @@ -925,7 +969,7 @@ static void nfs_redirty_request(struct nfs_page *req)  	struct page 
*page = req->wb_page;  	nfs_mark_request_dirty(req); -	nfs_clear_page_tag_locked(req); +	nfs_unlock_request(req);  	nfs_end_page_writeback(page);  } @@ -1128,23 +1172,14 @@ out:  	nfs_writedata_release(calldata);  } -#if defined(CONFIG_NFS_V4_1)  void nfs_write_prepare(struct rpc_task *task, void *calldata)  {  	struct nfs_write_data *data = calldata; - -	if (nfs4_setup_sequence(NFS_SERVER(data->inode), -				&data->args.seq_args, -				&data->res.seq_res, 1, task)) -		return; -	rpc_call_start(task); +	NFS_PROTO(data->inode)->write_rpc_prepare(task, data);  } -#endif /* CONFIG_NFS_V4_1 */  static const struct rpc_call_ops nfs_write_partial_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_writeback_done_partial,  	.rpc_release = nfs_writeback_release_partial,  }; @@ -1199,16 +1234,14 @@ static void nfs_writeback_release_full(void *calldata)  remove_request:  		nfs_inode_remove_request(req);  	next: -		nfs_clear_page_tag_locked(req); +		nfs_unlock_request(req);  		nfs_end_page_writeback(page);  	}  	nfs_writedata_release(calldata);  }  static const struct rpc_call_ops nfs_write_full_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_writeback_done_full,  	.rpc_release = nfs_writeback_release_full,  }; @@ -1325,7 +1358,6 @@ void nfs_commitdata_release(void *data)  {  	struct nfs_write_data *wdata = data; -	put_lseg(wdata->lseg);  	put_nfs_open_context(wdata->args.context);  	nfs_commit_free(wdata);  } @@ -1411,7 +1443,7 @@ void nfs_retry_commit(struct list_head *page_list,  		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);  		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,  			     BDI_RECLAIMABLE); -		nfs_clear_page_tag_locked(req); +		nfs_unlock_request(req);  	}  }  EXPORT_SYMBOL_GPL(nfs_retry_commit); @@ -1460,7 +1492,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)  	while 
(!list_empty(&data->pages)) {  		req = nfs_list_entry(data->pages.next);  		nfs_list_remove_request(req); -		nfs_clear_request_commit(req); +		nfs_clear_page_commit(req->wb_page);  		dprintk("NFS:       commit (%s/%lld %d@%lld)",  			req->wb_context->dentry->d_sb->s_id, @@ -1486,7 +1518,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data)  		dprintk(" mismatch\n");  		nfs_mark_request_dirty(req);  	next: -		nfs_clear_page_tag_locked(req); +		nfs_unlock_request(req);  	}  }  EXPORT_SYMBOL_GPL(nfs_commit_release_pages); @@ -1501,9 +1533,7 @@ static void nfs_commit_release(void *calldata)  }  static const struct rpc_call_ops nfs_commit_ops = { -#if defined(CONFIG_NFS_V4_1)  	.rpc_call_prepare = nfs_write_prepare, -#endif /* CONFIG_NFS_V4_1 */  	.rpc_call_done = nfs_commit_done,  	.rpc_release = nfs_commit_release,  }; @@ -1517,7 +1547,7 @@ int nfs_commit_inode(struct inode *inode, int how)  	res = nfs_commit_set_lock(NFS_I(inode), may_wait);  	if (res <= 0)  		goto out_mark_dirty; -	res = nfs_scan_commit(inode, &head, 0, 0); +	res = nfs_scan_commit(inode, &head);  	if (res) {  		int error; @@ -1635,6 +1665,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)  		if (req == NULL)  			break;  		if (nfs_lock_request_dontget(req)) { +			nfs_clear_request_commit(req);  			nfs_inode_remove_request(req);  			/*  			 * In case nfs_inode_remove_request has marked the diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index ce7f0758d84..9559ce46873 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -72,7 +72,7 @@ int nfsd_fault_inject_init(void)  {  	unsigned int i;  	struct nfsd_fault_inject_op *op; -	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; +	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;  	debug_dir = debugfs_create_dir("nfsd", NULL);  	if (!debug_dir) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6f3ebb48b12..0e262f32ac4 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -605,24 +605,24 
@@ static struct rpc_version nfs_cb_version4 = {  	.procs			= nfs4_cb_procedures  }; -static struct rpc_version *nfs_cb_version[] = { +static const struct rpc_version *nfs_cb_version[] = {  	&nfs_cb_version4,  }; -static struct rpc_program cb_program; +static const struct rpc_program cb_program;  static struct rpc_stat cb_stats = {  	.program		= &cb_program  };  #define NFS4_CALLBACK 0x40000000 -static struct rpc_program cb_program = { +static const struct rpc_program cb_program = {  	.name			= "nfs4_cb",  	.number			= NFS4_CALLBACK,  	.nrvers			= ARRAY_SIZE(nfs_cb_version),  	.version		= nfs_cb_version,  	.stats			= &cb_stats, -	.pipe_dir_name		= "/nfsd4_cb", +	.pipe_dir_name		= "nfsd4_cb",  };  static int max_cb_time(void) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e8c98f00967..c5cddd65942 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1308,7 +1308,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r  	else  		goto out_err; -	conn->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val, +	conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,  					    se->se_callback_addr_len,  					    (struct sockaddr *)&conn->cb_addr,  					    sizeof(conn->cb_addr)); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 748eda93ce5..64c24af8d7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -223,7 +223,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)  	if (qword_get(&buf, fo_path, size) < 0)  		return -EINVAL; -	if (rpc_pton(fo_path, size, sap, salen) == 0) +	if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0)  		return -EINVAL;  	return nlmsvc_unlock_all_by_ip(sap); @@ -722,7 +722,7 @@ static ssize_t __write_ports_addxprt(char *buf)  	nfsd_serv->sv_nrthreads--;  	return 0;  out_close: -	xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); +	xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port);  	if (xprt != NULL) {  		
svc_close_xprt(xprt);  		svc_xprt_put(xprt); @@ -748,7 +748,7 @@ static ssize_t __write_ports_delxprt(char *buf)  	if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)  		return -EINVAL; -	xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); +	xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);  	if (xprt == NULL)  		return -ENOTCONN; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index eda7d7e55e0..fce472f5f39 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -251,13 +251,13 @@ static void nfsd_shutdown(void)  	nfsd_up = false;  } -static void nfsd_last_thread(struct svc_serv *serv) +static void nfsd_last_thread(struct svc_serv *serv, struct net *net)  {  	/* When last nfsd thread exits we need to do some clean-up */  	nfsd_serv = NULL;  	nfsd_shutdown(); -	svc_rpcb_cleanup(serv); +	svc_rpcb_cleanup(serv, net);  	printk(KERN_WARNING "nfsd: last server has exited, flushing export "  			    "cache\n"); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index a2e2402b2af..6d4521feb6e 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -25,6 +25,7 @@  #include <linux/module.h>  #include <linux/sunrpc/stats.h>  #include <linux/nfsd/stats.h> +#include <net/net_namespace.h>  #include "nfsd.h" @@ -94,11 +95,11 @@ static const struct file_operations nfsd_proc_fops = {  void  nfsd_stat_init(void)  { -	svc_proc_register(&nfsd_svcstats, &nfsd_proc_fops); +	svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops);  }  void  nfsd_stat_shutdown(void)  { -	svc_proc_unregister("nfsd"); +	svc_proc_unregister(&init_net, "nfsd");  } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index edf6d3ed877..e59f71d0cf7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1541,30 +1541,31 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,  __be32  nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)  { -	struct dentry	*dentry;  	struct inode	*inode;  	mm_segment_t	oldfs;  	__be32		err;  	int		host_err; +	struct path path;  
	err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);  	if (err)  		goto out; -	dentry = fhp->fh_dentry; -	inode = dentry->d_inode; +	path.mnt = fhp->fh_export->ex_path.mnt; +	path.dentry = fhp->fh_dentry; +	inode = path.dentry->d_inode;  	err = nfserr_inval;  	if (!inode->i_op->readlink)  		goto out; -	touch_atime(fhp->fh_export->ex_path.mnt, dentry); +	touch_atime(&path);  	/* N.B. Why does this call need a get_fs()??  	 * Remove the set_fs and watch the fireworks:-) --okir  	 */  	oldfs = get_fs(); set_fs(KERNEL_DS); -	host_err = inode->i_op->readlink(dentry, buf, *lenp); +	host_err = inode->i_op->readlink(path.dentry, buf, *lenp);  	set_fs(oldfs);  	if (host_err < 0) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index c9b342c8b50..dab5c4c6dfa 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -218,11 +218,11 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,  								 kaddr, 1);  		mark_buffer_dirty(cp_bh); -		kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +		kaddr = kmap_atomic(header_bh->b_page);  		header = nilfs_cpfile_block_get_header(cpfile, header_bh,  						       kaddr);  		le64_add_cpu(&header->ch_ncheckpoints, 1); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		mark_buffer_dirty(header_bh);  		nilfs_mdt_mark_dirty(cpfile);  	} @@ -313,7 +313,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,  			continue;  		} -		kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); +		kaddr = kmap_atomic(cp_bh->b_page);  		cp = nilfs_cpfile_block_get_checkpoint(  			cpfile, cno, cp_bh, kaddr);  		nicps = 0; @@ -334,7 +334,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,  						cpfile, cp_bh, kaddr, nicps);  				if (count == 0) {  					/* make hole */ -					kunmap_atomic(kaddr, KM_USER0); +					kunmap_atomic(kaddr);  					brelse(cp_bh);  					ret =  					  nilfs_cpfile_delete_checkpoint_block( @@ -349,18 +349,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,  			}  		} -		kunmap_atomic(kaddr, 
KM_USER0); +		kunmap_atomic(kaddr);  		brelse(cp_bh);  	}  	if (tnicps > 0) { -		kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +		kaddr = kmap_atomic(header_bh->b_page);  		header = nilfs_cpfile_block_get_header(cpfile, header_bh,  						       kaddr);  		le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);  		mark_buffer_dirty(header_bh);  		nilfs_mdt_mark_dirty(cpfile); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  	}  	brelse(header_bh); @@ -408,7 +408,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,  			continue; /* skip hole */  		} -		kaddr = kmap_atomic(bh->b_page, KM_USER0); +		kaddr = kmap_atomic(bh->b_page);  		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);  		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {  			if (!nilfs_checkpoint_invalid(cp)) { @@ -418,7 +418,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,  				n++;  			}  		} -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		brelse(bh);  	} @@ -451,10 +451,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,  		ret = nilfs_cpfile_get_header_block(cpfile, &bh);  		if (ret < 0)  			goto out; -		kaddr = kmap_atomic(bh->b_page, KM_USER0); +		kaddr = kmap_atomic(bh->b_page);  		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);  		curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		brelse(bh);  		if (curr == 0) {  			ret = 0; @@ -472,7 +472,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,  			ret = 0; /* No snapshots (started from a hole block) */  		goto out;  	} -	kaddr = kmap_atomic(bh->b_page, KM_USER0); +	kaddr = kmap_atomic(bh->b_page);  	while (n < nci) {  		cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);  		curr = ~(__u64)0; /* Terminator */ @@ -488,7 +488,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode 
*cpfile, __u64 *cnop,  		next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);  		if (curr_blkoff != next_blkoff) { -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			brelse(bh);  			ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,  								0, &bh); @@ -496,12 +496,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,  				WARN_ON(ret == -ENOENT);  				goto out;  			} -			kaddr = kmap_atomic(bh->b_page, KM_USER0); +			kaddr = kmap_atomic(bh->b_page);  		}  		curr = next;  		curr_blkoff = next_blkoff;  	} -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	brelse(bh);  	*cnop = curr;  	ret = n; @@ -592,24 +592,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)  	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);  	if (ret < 0)  		goto out_sem; -	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(cp_bh->b_page);  	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);  	if (nilfs_checkpoint_invalid(cp)) {  		ret = -ENOENT; -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		goto out_cp;  	}  	if (nilfs_checkpoint_snapshot(cp)) {  		ret = 0; -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		goto out_cp;  	} -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);  	if (ret < 0)  		goto out_cp; -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);  	list = &header->ch_snapshot_list;  	curr_bh = header_bh; @@ -621,13 +621,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)  		prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);  		curr = prev;  		if (curr_blkoff != prev_blkoff) { -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			brelse(curr_bh);  			ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,  								0, 
&curr_bh);  			if (ret < 0)  				goto out_header; -			kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); +			kaddr = kmap_atomic(curr_bh->b_page);  		}  		curr_blkoff = prev_blkoff;  		cp = nilfs_cpfile_block_get_checkpoint( @@ -635,7 +635,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)  		list = &cp->cp_snapshot_list;  		prev = le64_to_cpu(list->ssl_prev);  	} -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (prev != 0) {  		ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, @@ -647,29 +647,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)  		get_bh(prev_bh);  	} -	kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(curr_bh->b_page);  	list = nilfs_cpfile_block_get_snapshot_list(  		cpfile, curr, curr_bh, kaddr);  	list->ssl_prev = cpu_to_le64(cno); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); -	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(cp_bh->b_page);  	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);  	cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);  	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);  	nilfs_checkpoint_set_snapshot(cp); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); -	kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(prev_bh->b_page);  	list = nilfs_cpfile_block_get_snapshot_list(  		cpfile, prev, prev_bh, kaddr);  	list->ssl_next = cpu_to_le64(cno); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);  	le64_add_cpu(&header->ch_nsnapshots, 1); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	mark_buffer_dirty(prev_bh);  	mark_buffer_dirty(curr_bh); @@ -710,23 +710,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)  	ret = nilfs_cpfile_get_checkpoint_block(cpfile, 
cno, 0, &cp_bh);  	if (ret < 0)  		goto out_sem; -	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(cp_bh->b_page);  	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);  	if (nilfs_checkpoint_invalid(cp)) {  		ret = -ENOENT; -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		goto out_cp;  	}  	if (!nilfs_checkpoint_snapshot(cp)) {  		ret = 0; -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		goto out_cp;  	}  	list = &cp->cp_snapshot_list;  	next = le64_to_cpu(list->ssl_next);  	prev = le64_to_cpu(list->ssl_prev); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);  	if (ret < 0) @@ -750,29 +750,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)  		get_bh(prev_bh);  	} -	kaddr = kmap_atomic(next_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(next_bh->b_page);  	list = nilfs_cpfile_block_get_snapshot_list(  		cpfile, next, next_bh, kaddr);  	list->ssl_prev = cpu_to_le64(prev); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); -	kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(prev_bh->b_page);  	list = nilfs_cpfile_block_get_snapshot_list(  		cpfile, prev, prev_bh, kaddr);  	list->ssl_next = cpu_to_le64(next); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); -	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(cp_bh->b_page);  	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);  	cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);  	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);  	nilfs_checkpoint_clear_snapshot(cp); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);  	le64_add_cpu(&header->ch_nsnapshots, -1); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	
mark_buffer_dirty(next_bh);  	mark_buffer_dirty(prev_bh); @@ -829,13 +829,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)  	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);  	if (ret < 0)  		goto out; -	kaddr = kmap_atomic(bh->b_page, KM_USER0); +	kaddr = kmap_atomic(bh->b_page);  	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);  	if (nilfs_checkpoint_invalid(cp))  		ret = -ENOENT;  	else  		ret = nilfs_checkpoint_snapshot(cp); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	brelse(bh);   out: @@ -912,12 +912,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)  	ret = nilfs_cpfile_get_header_block(cpfile, &bh);  	if (ret < 0)  		goto out_sem; -	kaddr = kmap_atomic(bh->b_page, KM_USER0); +	kaddr = kmap_atomic(bh->b_page);  	header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);  	cpstat->cs_cno = nilfs_mdt_cno(cpfile);  	cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);  	cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	brelse(bh);   out_sem: diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index fcc2f869af1..b5c13f3576b 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -85,13 +85,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)  	struct nilfs_dat_entry *entry;  	void *kaddr; -	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(req->pr_entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,  					     req->pr_entry_bh, kaddr);  	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);  	entry->de_end = cpu_to_le64(NILFS_CNO_MAX);  	entry->de_blocknr = cpu_to_le64(0); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	nilfs_palloc_commit_alloc_entry(dat, req);  	nilfs_dat_commit_entry(dat, req); @@ -109,13 +109,13 @@ static void nilfs_dat_commit_free(struct inode *dat,  	struct nilfs_dat_entry *entry;  	void *kaddr; -	
kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(req->pr_entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,  					     req->pr_entry_bh, kaddr);  	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);  	entry->de_end = cpu_to_le64(NILFS_CNO_MIN);  	entry->de_blocknr = cpu_to_le64(0); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	nilfs_dat_commit_entry(dat, req);  	nilfs_palloc_commit_free_entry(dat, req); @@ -136,12 +136,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,  	struct nilfs_dat_entry *entry;  	void *kaddr; -	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(req->pr_entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,  					     req->pr_entry_bh, kaddr);  	entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));  	entry->de_blocknr = cpu_to_le64(blocknr); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	nilfs_dat_commit_entry(dat, req);  } @@ -160,12 +160,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)  		return ret;  	} -	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(req->pr_entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,  					     req->pr_entry_bh, kaddr);  	start = le64_to_cpu(entry->de_start);  	blocknr = le64_to_cpu(entry->de_blocknr); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (blocknr == 0) {  		ret = nilfs_palloc_prepare_free_entry(dat, req); @@ -186,7 +186,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,  	sector_t blocknr;  	void *kaddr; -	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(req->pr_entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,  					     req->pr_entry_bh, kaddr);  	end = start = le64_to_cpu(entry->de_start); @@ -196,7 +196,7 @@ void nilfs_dat_commit_end(struct 
inode *dat, struct nilfs_palloc_req *req,  	}  	entry->de_end = cpu_to_le64(end);  	blocknr = le64_to_cpu(entry->de_blocknr); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (blocknr == 0)  		nilfs_dat_commit_free(dat, req); @@ -211,12 +211,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)  	sector_t blocknr;  	void *kaddr; -	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(req->pr_entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,  					     req->pr_entry_bh, kaddr);  	start = le64_to_cpu(entry->de_start);  	blocknr = le64_to_cpu(entry->de_blocknr); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (start == nilfs_mdt_cno(dat) && blocknr == 0)  		nilfs_palloc_abort_free_entry(dat, req); @@ -346,20 +346,20 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)  		}  	} -	kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);  	if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {  		printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,  		       (unsigned long long)vblocknr,  		       (unsigned long long)le64_to_cpu(entry->de_start),  		       (unsigned long long)le64_to_cpu(entry->de_end)); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		brelse(entry_bh);  		return -EINVAL;  	}  	WARN_ON(blocknr == 0);  	entry->de_blocknr = cpu_to_le64(blocknr); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	mark_buffer_dirty(entry_bh);  	nilfs_mdt_mark_dirty(dat); @@ -409,7 +409,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)  		}  	} -	kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(entry_bh->b_page);  	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);  	blocknr = le64_to_cpu(entry->de_blocknr);  	if (blocknr == 0) { @@ -419,7 
+419,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)  	*blocknrp = blocknr;   out: -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	brelse(entry_bh);  	return ret;  } @@ -440,7 +440,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,  						   0, &entry_bh);  		if (ret < 0)  			return ret; -		kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); +		kaddr = kmap_atomic(entry_bh->b_page);  		/* last virtual block number in this block */  		first = vinfo->vi_vblocknr;  		do_div(first, entries_per_block); @@ -456,7 +456,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,  			vinfo->vi_end = le64_to_cpu(entry->de_end);  			vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);  		} -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		brelse(entry_bh);  	} diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index ca35b3a46d1..df1a7fb238d 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -602,7 +602,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)  		unlock_page(page);  		goto fail;  	} -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memset(kaddr, 0, chunk_size);  	de = (struct nilfs_dir_entry *)kaddr;  	de->name_len = 1; @@ -617,7 +617,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)  	de->inode = cpu_to_le64(parent->i_ino);  	memcpy(de->name, "..\0", 4);  	nilfs_set_de_type(de, inode); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	nilfs_commit_chunk(page, mapping, 0, chunk_size);  fail:  	page_cache_release(page); diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 684d76300a8..5a48df79d67 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -122,11 +122,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)  		return ret;  	} -	kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(req.pr_entry_bh->b_page);  	raw_inode = 
nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,  						 req.pr_entry_bh, kaddr);  	raw_inode->i_flags = 0; -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	mark_buffer_dirty(req.pr_entry_bh);  	brelse(req.pr_entry_bh); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 886649627c3..2a70fce70c6 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -603,6 +603,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,  	nsegs = argv[4].v_nmembs;  	if (argv[4].v_size != argsz[4])  		goto out; +	if (nsegs > UINT_MAX / sizeof(__u64)) +		goto out;  	/*  	 * argv[4] points to segment numbers this ioctl cleans.  We diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 800e8d78a83..f9897d09c69 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -58,12 +58,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,  	set_buffer_mapped(bh); -	kaddr = kmap_atomic(bh->b_page, KM_USER0); +	kaddr = kmap_atomic(bh->b_page);  	memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);  	if (init_block)  		init_block(inode, bh, kaddr);  	flush_dcache_page(bh->b_page); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	set_buffer_uptodate(bh);  	mark_buffer_dirty(bh); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1cd3f624dff..fce2bbee66d 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -193,9 +193,6 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,  	struct nilfs_transaction_info ti;  	int err; -	if (inode->i_nlink >= NILFS_LINK_MAX) -		return -EMLINK; -  	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);  	if (err)  		return err; @@ -219,9 +216,6 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	struct nilfs_transaction_info ti;  	int err; -	if (dir->i_nlink >= NILFS_LINK_MAX) -		return -EMLINK; -  	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);  	if (err)  		return err; @@ -400,11 +394,6 @@ static int nilfs_rename(struct inode 
*old_dir, struct dentry *old_dentry,  		drop_nlink(new_inode);  		nilfs_mark_inode_dirty(new_inode);  	} else { -		if (dir_de) { -			err = -EMLINK; -			if (new_dir->i_nlink >= NILFS_LINK_MAX) -				goto out_dir; -		}  		err = nilfs_add_link(new_dentry, old_inode);  		if (err)  			goto out_dir; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 65221a04c6f..3e7b2a0dc0c 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -119,11 +119,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)  	struct page *spage = sbh->b_page, *dpage = dbh->b_page;  	struct buffer_head *bh; -	kaddr0 = kmap_atomic(spage, KM_USER0); -	kaddr1 = kmap_atomic(dpage, KM_USER1); +	kaddr0 = kmap_atomic(spage); +	kaddr1 = kmap_atomic(dpage);  	memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); -	kunmap_atomic(kaddr1, KM_USER1); -	kunmap_atomic(kaddr0, KM_USER0); +	kunmap_atomic(kaddr1); +	kunmap_atomic(kaddr0);  	dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;  	dbh->b_blocknr = sbh->b_blocknr; diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index a604ac0331b..f1626f5011c 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -493,9 +493,9 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,  	if (unlikely(!bh_org))  		return -EIO; -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	brelse(bh_org);  	return 0;  } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 850a7c0228f..dc9a913784a 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -227,9 +227,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,  		crc = crc32_le(crc, bh->b_data, bh->b_size);  	}  	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { -		kaddr = kmap_atomic(bh->b_page, KM_USER0); +		kaddr = kmap_atomic(bh->b_page);  		crc = 
crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  	}  	raw_sum->ss_datasum = cpu_to_le32(crc);  } diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 0a0aba617d8..c5b7653a439 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -111,11 +111,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,  	struct nilfs_sufile_header *header;  	void *kaddr; -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = kaddr + bh_offset(header_bh);  	le64_add_cpu(&header->sh_ncleansegs, ncleanadd);  	le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	mark_buffer_dirty(header_bh);  } @@ -319,11 +319,11 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)  	ret = nilfs_sufile_get_header_block(sufile, &header_bh);  	if (ret < 0)  		goto out_sem; -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = kaddr + bh_offset(header_bh);  	ncleansegs = le64_to_cpu(header->sh_ncleansegs);  	last_alloc = le64_to_cpu(header->sh_last_alloc); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	nsegments = nilfs_sufile_get_nsegments(sufile);  	maxsegnum = sui->allocmax; @@ -356,7 +356,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)  							   &su_bh);  		if (ret < 0)  			goto out_header; -		kaddr = kmap_atomic(su_bh->b_page, KM_USER0); +		kaddr = kmap_atomic(su_bh->b_page);  		su = nilfs_sufile_block_get_segment_usage(  			sufile, segnum, su_bh, kaddr); @@ -367,14 +367,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)  				continue;  			/* found a clean segment */  			nilfs_segment_usage_set_dirty(su); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr); -			kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +			kaddr = kmap_atomic(header_bh->b_page);  			header = kaddr + bh_offset(header_bh); 
 			le64_add_cpu(&header->sh_ncleansegs, -1);  			le64_add_cpu(&header->sh_ndirtysegs, 1);  			header->sh_last_alloc = cpu_to_le64(segnum); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			sui->ncleansegs--;  			mark_buffer_dirty(header_bh); @@ -385,7 +385,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)  			goto out_header;  		} -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		brelse(su_bh);  	} @@ -407,16 +407,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,  	struct nilfs_segment_usage *su;  	void *kaddr; -	kaddr = kmap_atomic(su_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(su_bh->b_page);  	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);  	if (unlikely(!nilfs_segment_usage_clean(su))) {  		printk(KERN_WARNING "%s: segment %llu must be clean\n",  		       __func__, (unsigned long long)segnum); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		return;  	}  	nilfs_segment_usage_set_dirty(su); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	nilfs_sufile_mod_counter(header_bh, -1, 1);  	NILFS_SUI(sufile)->ncleansegs--; @@ -433,11 +433,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,  	void *kaddr;  	int clean, dirty; -	kaddr = kmap_atomic(su_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(su_bh->b_page);  	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);  	if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&  	    su->su_nblocks == cpu_to_le32(0)) { -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		return;  	}  	clean = nilfs_segment_usage_clean(su); @@ -447,7 +447,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,  	su->su_lastmod = cpu_to_le64(0);  	su->su_nblocks = cpu_to_le32(0);  	su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	nilfs_sufile_mod_counter(header_bh, clean ? 
(u64)-1 : 0, dirty ? 0 : 1);  	NILFS_SUI(sufile)->ncleansegs -= clean; @@ -464,12 +464,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,  	void *kaddr;  	int sudirty; -	kaddr = kmap_atomic(su_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(su_bh->b_page);  	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);  	if (nilfs_segment_usage_clean(su)) {  		printk(KERN_WARNING "%s: segment %llu is already clean\n",  		       __func__, (unsigned long long)segnum); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		return;  	}  	WARN_ON(nilfs_segment_usage_error(su)); @@ -477,7 +477,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,  	sudirty = nilfs_segment_usage_dirty(su);  	nilfs_segment_usage_set_clean(su); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	mark_buffer_dirty(su_bh);  	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); @@ -525,13 +525,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,  	if (ret < 0)  		goto out_sem; -	kaddr = kmap_atomic(bh->b_page, KM_USER0); +	kaddr = kmap_atomic(bh->b_page);  	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);  	WARN_ON(nilfs_segment_usage_error(su));  	if (modtime)  		su->su_lastmod = cpu_to_le64(modtime);  	su->su_nblocks = cpu_to_le32(nblocks); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	mark_buffer_dirty(bh);  	nilfs_mdt_mark_dirty(sufile); @@ -572,7 +572,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)  	if (ret < 0)  		goto out_sem; -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = kaddr + bh_offset(header_bh);  	sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);  	sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); @@ -582,7 +582,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)  	spin_lock(&nilfs->ns_last_segment_lock);  	
sustat->ss_prot_seq = nilfs->ns_prot_seq;  	spin_unlock(&nilfs->ns_last_segment_lock); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	brelse(header_bh);   out_sem: @@ -598,15 +598,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,  	void *kaddr;  	int suclean; -	kaddr = kmap_atomic(su_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(su_bh->b_page);  	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);  	if (nilfs_segment_usage_error(su)) { -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		return;  	}  	suclean = nilfs_segment_usage_clean(su);  	nilfs_segment_usage_set_error(su); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (suclean) {  		nilfs_sufile_mod_counter(header_bh, -1, 0); @@ -675,7 +675,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,  			/* hole */  			continue;  		} -		kaddr = kmap_atomic(su_bh->b_page, KM_USER0); +		kaddr = kmap_atomic(su_bh->b_page);  		su = nilfs_sufile_block_get_segment_usage(  			sufile, segnum, su_bh, kaddr);  		su2 = su; @@ -684,7 +684,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,  			     ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||  			    nilfs_segment_is_active(nilfs, segnum + j)) {  				ret = -EBUSY; -				kunmap_atomic(kaddr, KM_USER0); +				kunmap_atomic(kaddr);  				brelse(su_bh);  				goto out_header;  			} @@ -696,7 +696,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,  				nc++;  			}  		} -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		if (nc > 0) {  			mark_buffer_dirty(su_bh);  			ncleaned += nc; @@ -772,10 +772,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)  		sui->ncleansegs -= nsegs - newnsegs;  	} -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = kaddr + bh_offset(header_bh);  	header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	
mark_buffer_dirty(header_bh);  	nilfs_mdt_mark_dirty(sufile); @@ -840,7 +840,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,  			continue;  		} -		kaddr = kmap_atomic(su_bh->b_page, KM_USER0); +		kaddr = kmap_atomic(su_bh->b_page);  		su = nilfs_sufile_block_get_segment_usage(  			sufile, segnum, su_bh, kaddr);  		for (j = 0; j < n; @@ -853,7 +853,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,  				si->sui_flags |=  					(1UL << NILFS_SEGMENT_USAGE_ACTIVE);  		} -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		brelse(su_bh);  	}  	ret = nsegs; @@ -902,10 +902,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,  		goto failed;  	sui = NILFS_SUI(sufile); -	kaddr = kmap_atomic(header_bh->b_page, KM_USER0); +	kaddr = kmap_atomic(header_bh->b_page);  	header = kaddr + bh_offset(header_bh);  	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	brelse(header_bh);  	sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 08e3d4f9df1..1099a76cee5 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -917,9 +917,8 @@ static int nilfs_get_root_dentry(struct super_block *sb,  	if (root->cno == NILFS_CPTREE_CURRENT_CNO) {  		dentry = d_find_alias(inode);  		if (!dentry) { -			dentry = d_alloc_root(inode); +			dentry = d_make_root(inode);  			if (!dentry) { -				iput(inode);  				ret = -ENOMEM;  				goto failed_dentry;  			} @@ -1059,6 +1058,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_export_op = &nilfs_export_ops;  	sb->s_root = NULL;  	sb->s_time_gran = 1; +	sb->s_max_links = NILFS_LINK_MAX;  	bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;  	sb->s_bdi = bdi ? 
: &default_backing_dev_info; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d3271409437..501b7f8b739 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -409,6 +409,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,  	nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);  	nilfs->ns_r_segments_percentage =  		le32_to_cpu(sbp->s_r_segments_percentage); +	if (nilfs->ns_r_segments_percentage < 1 || +	    nilfs->ns_r_segments_percentage > 99) { +		printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n"); +		return -EINVAL; +	} +  	nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));  	nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);  	return 0; @@ -515,6 +521,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,  		brelse(sbh[1]);  		sbh[1] = NULL;  		sbp[1] = NULL; +		valid[1] = 0;  		swp = 0;  	}  	if (!valid[swp]) { diff --git a/fs/notify/notification.c b/fs/notify/notification.c index ee188158a22..c887b1378f7 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -447,7 +447,7 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,  	return event;  } -__init int fsnotify_notification_init(void) +static __init int fsnotify_notification_init(void)  {  	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);  	fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); @@ -461,4 +461,3 @@ __init int fsnotify_notification_init(void)  	return 0;  }  subsys_initcall(fsnotify_notification_init); - diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 0b1e885b8cf..fa9c05f97af 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -94,11 +94,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)  			if (file_ofs < init_size)  				ofs = init_size - file_ofs;  			local_irq_save(flags); -			kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); +			kaddr = kmap_atomic(page);  			memset(kaddr + bh_offset(bh) + 
ofs, 0,  					bh->b_size - ofs);  			flush_dcache_page(page); -			kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); +			kunmap_atomic(kaddr);  			local_irq_restore(flags);  		}  	} else { @@ -147,11 +147,11 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)  		/* Should have been verified before we got here... */  		BUG_ON(!recs);  		local_irq_save(flags); -		kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); +		kaddr = kmap_atomic(page);  		for (i = 0; i < recs; i++)  			post_read_mst_fixup((NTFS_RECORD*)(kaddr +  					i * rec_size), rec_size); -		kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); +		kunmap_atomic(kaddr);  		local_irq_restore(flags);  		flush_dcache_page(page);  		if (likely(page_uptodate && !PageError(page))) @@ -504,7 +504,7 @@ retry_readpage:  		/* Race with shrinking truncate. */  		attr_len = i_size;  	} -	addr = kmap_atomic(page, KM_USER0); +	addr = kmap_atomic(page);  	/* Copy the data to the page. */  	memcpy(addr, (u8*)ctx->attr +  			le16_to_cpu(ctx->attr->data.resident.value_offset), @@ -512,7 +512,7 @@ retry_readpage:  	/* Zero the remainder of the page. */  	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);  	flush_dcache_page(page); -	kunmap_atomic(addr, KM_USER0); +	kunmap_atomic(addr);  put_unm_err_out:  	ntfs_attr_put_search_ctx(ctx);  unm_err_out: @@ -746,14 +746,14 @@ lock_retry_remap:  			unsigned long *bpos, *bend;  			/* Check if the buffer is zero. */ -			kaddr = kmap_atomic(page, KM_USER0); +			kaddr = kmap_atomic(page);  			bpos = (unsigned long *)(kaddr + bh_offset(bh));  			bend = (unsigned long *)((u8*)bpos + blocksize);  			do {  				if (unlikely(*bpos))  					break;  			} while (likely(++bpos < bend)); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			if (bpos == bend) {  				/*  				 * Buffer is zero and sparse, no need to write @@ -1495,14 +1495,14 @@ retry_writepage:  		/* Shrinking cannot fail. 
*/  		BUG_ON(err);  	} -	addr = kmap_atomic(page, KM_USER0); +	addr = kmap_atomic(page);  	/* Copy the data from the page to the mft record. */  	memcpy((u8*)ctx->attr +  			le16_to_cpu(ctx->attr->data.resident.value_offset),  			addr, attr_len);  	/* Zero out of bounds area in the page cache page. */  	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len); -	kunmap_atomic(addr, KM_USER0); +	kunmap_atomic(addr);  	flush_dcache_page(page);  	flush_dcache_mft_record_page(ctx->ntfs_ino);  	/* We are done with the page. */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index f14fde2b03d..a27e3fecefa 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,7 +1,7 @@  /**   * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.   * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.   * Copyright (c) 2002 Richard Russon   *   * This program/include file is free software; you can redistribute it and/or @@ -345,10 +345,10 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,  	unsigned long flags;  	bool is_retry = false; +	BUG_ON(!ni);  	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",  			ni->mft_no, (unsigned long long)vcn,  			write_locked ? "write" : "read"); -	BUG_ON(!ni);  	BUG_ON(!NInoNonResident(ni));  	BUG_ON(vcn < 0);  	if (!ni->runlist.rl) { @@ -469,9 +469,9 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,  	int err = 0;  	bool is_retry = false; +	BUG_ON(!ni);  	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",  			ni->mft_no, (unsigned long long)vcn, ctx ? 
"" : "out"); -	BUG_ON(!ni);  	BUG_ON(!NInoNonResident(ni));  	BUG_ON(vcn < 0);  	if (!ni->runlist.rl) { @@ -1656,12 +1656,12 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)  	attr_size = le32_to_cpu(a->data.resident.value_length);  	BUG_ON(attr_size != data_size);  	if (page && !PageUptodate(page)) { -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		memcpy(kaddr, (u8*)a +  				le16_to_cpu(a->data.resident.value_offset),  				attr_size);  		memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		flush_dcache_page(page);  		SetPageUptodate(page);  	} @@ -1806,9 +1806,9 @@ undo_err_out:  			sizeof(a->data.resident.reserved));  	/* Copy the data from the page back to the attribute value. */  	if (page) { -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		memcpy((u8*)a + mp_ofs, kaddr, attr_size); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  	}  	/* Setup the allocated size in the ntfs inode in case it changed. 
*/  	write_lock_irqsave(&ni->size_lock, flags); @@ -2540,10 +2540,10 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)  		size = PAGE_CACHE_SIZE;  		if (idx == end)  			size = end_ofs; -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		memset(kaddr + start_ofs, val, size - start_ofs);  		flush_dcache_page(page); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		set_page_dirty(page);  		page_cache_release(page);  		balance_dirty_pages_ratelimited(mapping); @@ -2561,10 +2561,10 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)  					"page (index 0x%lx).", idx);  			return -ENOMEM;  		} -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		memset(kaddr, val, PAGE_CACHE_SIZE);  		flush_dcache_page(page); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		/*  		 * If the page has buffers, mark them uptodate since buffer  		 * state and not page state is definitive in 2.6 kernels. 
@@ -2598,10 +2598,10 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)  					"(error, index 0x%lx).", idx);  			return PTR_ERR(page);  		} -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		memset(kaddr, val, end_ofs);  		flush_dcache_page(page); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		set_page_dirty(page);  		page_cache_release(page);  		balance_dirty_pages_ratelimited(mapping); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c587e2d2718..8639169221c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -704,7 +704,7 @@ map_buffer_cached:  				u8 *kaddr;  				unsigned pofs; -				kaddr = kmap_atomic(page, KM_USER0); +				kaddr = kmap_atomic(page);  				if (bh_pos < pos) {  					pofs = bh_pos & ~PAGE_CACHE_MASK;  					memset(kaddr + pofs, 0, pos - bh_pos); @@ -713,7 +713,7 @@ map_buffer_cached:  					pofs = end & ~PAGE_CACHE_MASK;  					memset(kaddr + pofs, 0, bh_end - end);  				} -				kunmap_atomic(kaddr, KM_USER0); +				kunmap_atomic(kaddr);  				flush_dcache_page(page);  			}  			continue; @@ -1287,9 +1287,9 @@ static inline size_t ntfs_copy_from_user(struct page **pages,  		len = PAGE_CACHE_SIZE - ofs;  		if (len > bytes)  			len = bytes; -		addr = kmap_atomic(*pages, KM_USER0); +		addr = kmap_atomic(*pages);  		left = __copy_from_user_inatomic(addr + ofs, buf, len); -		kunmap_atomic(addr, KM_USER0); +		kunmap_atomic(addr);  		if (unlikely(left)) {  			/* Do it the slow way. */  			addr = kmap(*pages); @@ -1401,10 +1401,10 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,  		len = PAGE_CACHE_SIZE - ofs;  		if (len > bytes)  			len = bytes; -		addr = kmap_atomic(*pages, KM_USER0); +		addr = kmap_atomic(*pages);  		copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,  				*iov, *iov_ofs, len); -		kunmap_atomic(addr, KM_USER0); +		kunmap_atomic(addr);  		if (unlikely(copied != len)) {  			/* Do it the slow way. 
*/  			addr = kmap(*pages); @@ -1691,7 +1691,7 @@ static int ntfs_commit_pages_after_write(struct page **pages,  	BUG_ON(end > le32_to_cpu(a->length) -  			le16_to_cpu(a->data.resident.value_offset));  	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	/* Copy the received data from the page to the mft record. */  	memcpy(kattr + pos, kaddr + pos, bytes);  	/* Update the attribute length if necessary. */ @@ -1713,7 +1713,7 @@ static int ntfs_commit_pages_after_write(struct page **pages,  		flush_dcache_page(page);  		SetPageUptodate(page);  	} -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	/* Update initialized_size/i_size if necessary. */  	read_lock_irqsave(&ni->size_lock, flags);  	initialized_size = ni->initialized_size; diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index faece719086..809c0e6d8e0 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -2008,14 +2008,14 @@ typedef struct {   *   * When a directory is small enough to fit inside the index root then this   * is the only attribute describing the directory. When the directory is too - * large to fit in the index root, on the other hand, two aditional attributes + * large to fit in the index root, on the other hand, two additional attributes   * are present: an index allocation attribute, containing sub-nodes of the B+   * directory tree (see below), and a bitmap attribute, describing which virtual   * cluster numbers (vcns) in the index allocation attribute are in use by an   * index block.   *   * NOTE: The root directory (FILE_root) contains an entry for itself. Other - * dircetories do not contain entries for themselves, though. + * directories do not contain entries for themselves, though.   */  typedef struct {  	ATTR_TYPE type;			/* Type of the indexed attribute. 
Is diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 382857f9c7d..3014a36a255 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,7 +1,7 @@  /**   * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.   * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.   * Copyright (c) 2002 Richard Russon   *   * This program/include file is free software; you can redistribute it and/or @@ -1367,7 +1367,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)  			ntfs_error(vol->sb, "Failed to merge runlists for mft "  					"bitmap.");  			if (ntfs_cluster_free_from_rl(vol, rl2)) { -				ntfs_error(vol->sb, "Failed to dealocate " +				ntfs_error(vol->sb, "Failed to deallocate "  						"allocated cluster.%s", es);  				NVolSetErrors(vol);  			} @@ -1805,7 +1805,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)  		ntfs_error(vol->sb, "Failed to merge runlists for mft data "  				"attribute.");  		if (ntfs_cluster_free_from_rl(vol, rl2)) { -			ntfs_error(vol->sb, "Failed to dealocate clusters " +			ntfs_error(vol->sb, "Failed to deallocate clusters "  					"from the mft data attribute.%s", es);  			NVolSetErrors(vol);  		} diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 5a4a8af5c40..b341492542c 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@  /*   * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.   * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.   
* Copyright (c) 2001,2002 Richard Russon   *   * This program/include file is free software; you can redistribute it and/or @@ -1239,7 +1239,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol)  {  	MFT_REF mref;  	struct inode *vi; -	ntfs_inode *ni;  	struct page *page;  	u32 *kaddr, *kend;  	ntfs_name *name = NULL; @@ -1290,7 +1289,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol)  				"is not the system volume.", i_size_read(vi));  		goto iput_out;  	} -	ni = NTFS_I(vi);  	page = ntfs_map_page(vi->i_mapping, 0);  	if (IS_ERR(page)) {  		ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); @@ -2475,7 +2473,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)  			nr_free -= PAGE_CACHE_SIZE * 8;  			continue;  		} -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		/*  		 * Subtract the number of set bits. If this  		 * is the last page and it is partial we don't really care as @@ -2485,7 +2483,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)  		 */  		nr_free -= bitmap_weight(kaddr,  					PAGE_CACHE_SIZE * BITS_PER_BYTE); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		page_cache_release(page);  	}  	ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); @@ -2546,7 +2544,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,  			nr_free -= PAGE_CACHE_SIZE * 8;  			continue;  		} -		kaddr = kmap_atomic(page, KM_USER0); +		kaddr = kmap_atomic(page);  		/*  		 * Subtract the number of set bits. 
If this  		 * is the last page and it is partial we don't really care as @@ -2556,7 +2554,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,  		 */  		nr_free -= bitmap_weight(kaddr,  					PAGE_CACHE_SIZE * BITS_PER_BYTE); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  		page_cache_release(page);  	}  	ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", @@ -2910,9 +2908,10 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)  		ntfs_error(sb, "Failed to load system files.");  		goto unl_upcase_iput_tmp_ino_err_out_now;  	} -	if ((sb->s_root = d_alloc_root(vol->root_ino))) { -		/* We grab a reference, simulating an ntfs_iget(). */ -		ihold(vol->root_ino); + +	/* We grab a reference, simulating an ntfs_iget(). */ +	ihold(vol->root_ino); +	if ((sb->s_root = d_make_root(vol->root_ino))) {  		ntfs_debug("Exiting, status successful.");  		/* Release the default upcase if it has no users. */  		mutex_lock(&ntfs_lock); @@ -3160,6 +3159,8 @@ static int __init init_ntfs_fs(void)  	}  	printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); +	/* Unregister the ntfs sysctls. */ +	ntfs_sysctl(0);  sysctl_err_out:  	kmem_cache_destroy(ntfs_big_inode_cache);  big_inode_err_out: diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 78b68af3b0e..657743254eb 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -102,7 +102,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  		 * copy, the data is still good. 
*/  		if (buffer_jbd(buffer_cache_bh)  		    && ocfs2_inode_is_new(inode)) { -			kaddr = kmap_atomic(bh_result->b_page, KM_USER0); +			kaddr = kmap_atomic(bh_result->b_page);  			if (!kaddr) {  				mlog(ML_ERROR, "couldn't kmap!\n");  				goto bail; @@ -110,7 +110,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  			memcpy(kaddr + (bh_result->b_size * iblock),  			       buffer_cache_bh->b_data,  			       bh_result->b_size); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			set_buffer_uptodate(bh_result);  		}  		brelse(buffer_cache_bh); @@ -236,13 +236,13 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,  		return -EROFS;  	} -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	if (size)  		memcpy(kaddr, di->id2.i_data.id_data, size);  	/* Clear the remaining part of the page */  	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);  	flush_dcache_page(page); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	SetPageUptodate(page); @@ -689,7 +689,7 @@ static void ocfs2_clear_page_regions(struct page *page,  	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	if (from || to) {  		if (from > cluster_start) @@ -700,7 +700,7 @@ static void ocfs2_clear_page_regions(struct page *page,  		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);  	} -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  }  /* @@ -1981,9 +1981,9 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,  		}  	} -	kaddr = kmap_atomic(wc->w_target_page, KM_USER0); +	kaddr = kmap_atomic(wc->w_target_page);  	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	trace_ocfs2_write_end_inline(  	     (unsigned long long)OCFS2_I(inode)->ip_blkno, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 
abfac0d7ae9..3b5825ef319 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -582,24 +582,14 @@ static int dlmfs_fill_super(struct super_block * sb,  			    void * data,  			    int silent)  { -	struct inode * inode; -	struct dentry * root; -  	sb->s_maxbytes = MAX_LFS_FILESIZE;  	sb->s_blocksize = PAGE_CACHE_SIZE;  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;  	sb->s_magic = DLMFS_MAGIC;  	sb->s_op = &dlmfs_ops; -	inode = dlmfs_get_root_inode(sb); -	if (!inode) -		return -ENOMEM; - -	root = d_alloc_root(inode); -	if (!root) { -		iput(inode); +	sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); +	if (!sb->s_root)  		return -ENOMEM; -	} -	sb->s_root = root;  	return 0;  } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index be244692550..a9856e3eaaf 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir,  	handle_t *handle = NULL;  	struct buffer_head *old_dir_bh = NULL;  	struct buffer_head *new_dir_bh = NULL; -	nlink_t old_dir_nlink = old_dir->i_nlink; +	u32 old_dir_nlink = old_dir->i_nlink;  	struct ocfs2_dinode *old_di;  	struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };  	struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 604e12c4e97..68f4541c2db 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1154,19 +1154,19 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)  	}  	status = ocfs2_mount_volume(sb); -	if (osb->root_inode) -		inode = igrab(osb->root_inode); -  	if (status < 0)  		goto read_super_error; +	if (osb->root_inode) +		inode = igrab(osb->root_inode); +  	if (!inode) {  		status = -EIO;  		mlog_errno(status);  		goto read_super_error;  	} -	root = d_alloc_root(inode); +	root = d_make_root(inode);  	if (!root) {  		status = -ENOMEM;  		mlog_errno(status); @@ -1220,9 +1220,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)  
read_super_error:  	brelse(bh); -	if (inode) -		iput(inode); -  	if (osb) {  		atomic_set(&osb->vol_state, VOLUME_DISABLED);  		wake_up(&osb->osb_mount_event); @@ -1627,21 +1624,17 @@ static int __init ocfs2_init(void)  		init_waitqueue_head(&ocfs2__ioend_wq[i]);  	status = init_ocfs2_uptodate_cache(); -	if (status < 0) { -		mlog_errno(status); -		goto leave; -	} +	if (status < 0) +		goto out1;  	status = ocfs2_initialize_mem_caches(); -	if (status < 0) { -		mlog_errno(status); -		goto leave; -	} +	if (status < 0) +		goto out2;  	ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");  	if (!ocfs2_wq) {  		status = -ENOMEM; -		goto leave; +		goto out3;  	}  	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); @@ -1653,17 +1646,23 @@ static int __init ocfs2_init(void)  	ocfs2_set_locking_protocol();  	status = register_quota_format(&ocfs2_quota_format); -leave: -	if (status < 0) { -		ocfs2_free_mem_caches(); -		exit_ocfs2_uptodate_cache(); -		mlog_errno(status); -	} +	if (status < 0) +		goto out4; +	status = register_filesystem(&ocfs2_fs_type); +	if (!status) +		return 0; -	if (status >= 0) { -		return register_filesystem(&ocfs2_fs_type); -	} else -		return -1; +	unregister_quota_format(&ocfs2_quota_format); +out4: +	destroy_workqueue(ocfs2_wq); +	debugfs_remove(ocfs2_debugfs_root); +out3: +	ocfs2_free_mem_caches(); +out2: +	exit_ocfs2_uptodate_cache(); +out1: +	mlog_errno(status); +	return status;  }  static void __exit ocfs2_exit(void) diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 6065bb0ba20..dbc84222258 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -539,11 +539,9 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)  		goto out_brelse_bh2;  	} -	sb->s_root = d_alloc_root(root); -	if (!sb->s_root) { -		iput(root); +	sb->s_root = d_make_root(root); +	if (!sb->s_root)  		goto out_brelse_bh2; -	}  	printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name);  	ret = 0; diff --git a/fs/openpromfs/inode.c 
b/fs/openpromfs/inode.c index a88c03bc749..bc49c975d50 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -408,13 +408,12 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)  	oi->type = op_inode_node;  	oi->u.node = of_find_node_by_path("/"); -	s->s_root = d_alloc_root(root_inode); +	s->s_root = d_make_root(root_inode);  	if (!s->s_root)  		goto out_no_root_dentry;  	return 0;  out_no_root_dentry: -	iput(root_inode);  	ret = -ENOMEM;  out_no_root:  	printk("openprom_fill_super: get root inode failed\n"); diff --git a/fs/pipe.c b/fs/pipe.c index a932ced92a1..25feaa3faac 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -13,6 +13,7 @@  #include <linux/fs.h>  #include <linux/log2.h>  #include <linux/mount.h> +#include <linux/magic.h>  #include <linux/pipe_fs_i.h>  #include <linux/uio.h>  #include <linux/highmem.h> @@ -230,7 +231,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,  {  	if (atomic) {  		buf->flags |= PIPE_BUF_FLAG_ATOMIC; -		return kmap_atomic(buf->page, KM_USER0); +		return kmap_atomic(buf->page);  	}  	return kmap(buf->page); @@ -251,7 +252,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,  {  	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {  		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; -		kunmap_atomic(map_data, KM_USER0); +		kunmap_atomic(map_data);  	} else  		kunmap(buf->page);  } @@ -565,14 +566,14 @@ redo1:  			iov_fault_in_pages_read(iov, chars);  redo2:  			if (atomic) -				src = kmap_atomic(page, KM_USER0); +				src = kmap_atomic(page);  			else  				src = kmap(page);  			error = pipe_iov_copy_from_user(src, iov, chars,  							atomic);  			if (atomic) -				kunmap_atomic(src, KM_USER0); +				kunmap_atomic(src);  			else  				kunmap(page); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index cea4623f1ed..5e325a42e33 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -18,7 +18,7 @@  #include <linux/fs.h>  #include <linux/sched.h>  #include <linux/posix_acl.h> -#include <linux/module.h> +#include 
<linux/export.h>  #include <linux/errno.h> diff --git a/fs/proc/array.c b/fs/proc/array.c index c602b8d20f0..fbb53c24908 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -462,59 +462,56 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,  	/* convert nsec -> ticks */  	start_time = nsec_to_clock_t(start_time); -	seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ -%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", -		pid_nr_ns(pid, ns), -		tcomm, -		state, -		ppid, -		pgid, -		sid, -		tty_nr, -		tty_pgrp, -		task->flags, -		min_flt, -		cmin_flt, -		maj_flt, -		cmaj_flt, -		cputime_to_clock_t(utime), -		cputime_to_clock_t(stime), -		cputime_to_clock_t(cutime), -		cputime_to_clock_t(cstime), -		priority, -		nice, -		num_threads, -		start_time, -		vsize, -		mm ? get_mm_rss(mm) : 0, -		rsslim, -		mm ? (permitted ? mm->start_code : 1) : 0, -		mm ? (permitted ? mm->end_code : 1) : 0, -		(permitted && mm) ? mm->start_stack : 0, -		esp, -		eip, -		/* The signal information here is obsolete. -		 * It must be decimal for Linux 2.0 compatibility. -		 * Use /proc/#/status for real-time signals. -		 */ -		task->pending.signal.sig[0] & 0x7fffffffUL, -		task->blocked.sig[0] & 0x7fffffffUL, -		sigign      .sig[0] & 0x7fffffffUL, -		sigcatch    .sig[0] & 0x7fffffffUL, -		wchan, -		0UL, -		0UL, -		task->exit_signal, -		task_cpu(task), -		task->rt_priority, -		task->policy, -		(unsigned long long)delayacct_blkio_ticks(task), -		cputime_to_clock_t(gtime), -		cputime_to_clock_t(cgtime), -		(mm && permitted) ? mm->start_data : 0, -		(mm && permitted) ? mm->end_data : 0, -		(mm && permitted) ? 
mm->start_brk : 0); +	seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); +	seq_put_decimal_ll(m, ' ', ppid); +	seq_put_decimal_ll(m, ' ', pgid); +	seq_put_decimal_ll(m, ' ', sid); +	seq_put_decimal_ll(m, ' ', tty_nr); +	seq_put_decimal_ll(m, ' ', tty_pgrp); +	seq_put_decimal_ull(m, ' ', task->flags); +	seq_put_decimal_ull(m, ' ', min_flt); +	seq_put_decimal_ull(m, ' ', cmin_flt); +	seq_put_decimal_ull(m, ' ', maj_flt); +	seq_put_decimal_ull(m, ' ', cmaj_flt); +	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); +	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); +	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); +	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); +	seq_put_decimal_ll(m, ' ', priority); +	seq_put_decimal_ll(m, ' ', nice); +	seq_put_decimal_ll(m, ' ', num_threads); +	seq_put_decimal_ull(m, ' ', 0); +	seq_put_decimal_ull(m, ' ', start_time); +	seq_put_decimal_ull(m, ' ', vsize); +	seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0); +	seq_put_decimal_ull(m, ' ', rsslim); +	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); +	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); +	seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); +	seq_put_decimal_ull(m, ' ', esp); +	seq_put_decimal_ull(m, ' ', eip); +	/* The signal information here is obsolete. +	 * It must be decimal for Linux 2.0 compatibility. +	 * Use /proc/#/status for real-time signals. 
+	 */ +	seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); +	seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); +	seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); +	seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); +	seq_put_decimal_ull(m, ' ', wchan); +	seq_put_decimal_ull(m, ' ', 0); +	seq_put_decimal_ull(m, ' ', 0); +	seq_put_decimal_ll(m, ' ', task->exit_signal); +	seq_put_decimal_ll(m, ' ', task_cpu(task)); +	seq_put_decimal_ull(m, ' ', task->rt_priority); +	seq_put_decimal_ull(m, ' ', task->policy); +	seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); +	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); +	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); +	seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0); +	seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0); +	seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0); +	seq_putc(m, '\n');  	if (mm)  		mmput(mm);  	return 0; @@ -542,8 +539,20 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,  		size = task_statm(mm, &shared, &text, &data, &resident);  		mmput(mm);  	} -	seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", -			size, resident, shared, text, data); +	/* +	 * For quick read, open code by putting numbers directly +	 * expected format is +	 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", +	 *               size, resident, shared, text, data); +	 */ +	seq_put_decimal_ull(m, 0, size); +	seq_put_decimal_ull(m, ' ', resident); +	seq_put_decimal_ull(m, ' ', shared); +	seq_put_decimal_ull(m, ' ', text); +	seq_put_decimal_ull(m, ' ', 0); +	seq_put_decimal_ull(m, ' ', text); +	seq_put_decimal_ull(m, ' ', 0); +	seq_putc(m, '\n');  	return 0;  } diff --git a/fs/proc/base.c b/fs/proc/base.c index 9cde9edf9c4..3b42c1418f3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path)  	return result;  } -static 
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ -	struct mm_struct *mm; -	int err; - -	err =  mutex_lock_killable(&task->signal->cred_guard_mutex); -	if (err) -		return ERR_PTR(err); - -	mm = get_task_mm(task); -	if (mm && mm != current->mm && -			!ptrace_may_access(task, mode)) { -		mmput(mm); -		mm = ERR_PTR(-EACCES); -	} -	mutex_unlock(&task->signal->cred_guard_mutex); - -	return mm; -} -  struct mm_struct *mm_for_maps(struct task_struct *task)  {  	return mm_access(task, PTRACE_MODE_READ); @@ -711,6 +691,13 @@ static int mem_open(struct inode* inode, struct file* file)  	if (IS_ERR(mm))  		return PTR_ERR(mm); +	if (mm) { +		/* ensure this mm_struct can't be freed */ +		atomic_inc(&mm->mm_count); +		/* but do not pin its memory */ +		mmput(mm); +	} +  	/* OK to pass negative loff_t, we can catch out-of-range */  	file->f_mode |= FMODE_UNSIGNED_OFFSET;  	file->private_data = mm; @@ -718,57 +705,13 @@ static int mem_open(struct inode* inode, struct file* file)  	return 0;  } -static ssize_t mem_read(struct file * file, char __user * buf, -			size_t count, loff_t *ppos) +static ssize_t mem_rw(struct file *file, char __user *buf, +			size_t count, loff_t *ppos, int write)  { -	int ret; -	char *page; -	unsigned long src = *ppos;  	struct mm_struct *mm = file->private_data; - -	if (!mm) -		return 0; - -	page = (char *)__get_free_page(GFP_TEMPORARY); -	if (!page) -		return -ENOMEM; - -	ret = 0; -  -	while (count > 0) { -		int this_len, retval; - -		this_len = (count > PAGE_SIZE) ? 
PAGE_SIZE : count; -		retval = access_remote_vm(mm, src, page, this_len, 0); -		if (!retval) { -			if (!ret) -				ret = -EIO; -			break; -		} - -		if (copy_to_user(buf, page, retval)) { -			ret = -EFAULT; -			break; -		} -  -		ret += retval; -		src += retval; -		buf += retval; -		count -= retval; -	} -	*ppos = src; - -	free_page((unsigned long) page); -	return ret; -} - -static ssize_t mem_write(struct file * file, const char __user *buf, -			 size_t count, loff_t *ppos) -{ -	int copied; +	unsigned long addr = *ppos; +	ssize_t copied;  	char *page; -	unsigned long dst = *ppos; -	struct mm_struct *mm = file->private_data;  	if (!mm)  		return 0; @@ -778,31 +721,54 @@ static ssize_t mem_write(struct file * file, const char __user *buf,  		return -ENOMEM;  	copied = 0; +	if (!atomic_inc_not_zero(&mm->mm_users)) +		goto free; +  	while (count > 0) { -		int this_len, retval; +		int this_len = min_t(int, count, PAGE_SIZE); -		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; -		if (copy_from_user(page, buf, this_len)) { +		if (write && copy_from_user(page, buf, this_len)) {  			copied = -EFAULT;  			break;  		} -		retval = access_remote_vm(mm, dst, page, this_len, 1); -		if (!retval) { + +		this_len = access_remote_vm(mm, addr, page, this_len, write); +		if (!this_len) {  			if (!copied)  				copied = -EIO;  			break;  		} -		copied += retval; -		buf += retval; -		dst += retval; -		count -= retval;			 + +		if (!write && copy_to_user(buf, page, this_len)) { +			copied = -EFAULT; +			break; +		} + +		buf += this_len; +		addr += this_len; +		copied += this_len; +		count -= this_len;  	} -	*ppos = dst; +	*ppos = addr; +	mmput(mm); +free:  	free_page((unsigned long) page);  	return copied;  } +static ssize_t mem_read(struct file *file, char __user *buf, +			size_t count, loff_t *ppos) +{ +	return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, +			 size_t count, loff_t *ppos) +{ +	return mem_rw(file, (char 
__user*)buf, count, ppos, 1); +} +  loff_t mem_lseek(struct file *file, loff_t offset, int orig)  {  	switch (orig) { @@ -822,8 +788,8 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)  static int mem_release(struct inode *inode, struct file *file)  {  	struct mm_struct *mm = file->private_data; - -	mmput(mm); +	if (mm) +		mmdrop(mm);  	return 0;  } @@ -1344,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf,  	if (!p)  		return -ESRCH; -	err = nice; -	err = proc_sched_autogroup_set_nice(p, &err); +	err = proc_sched_autogroup_set_nice(p, nice);  	if (err)  		count = err; @@ -3024,9 +2989,9 @@ static const struct pid_entry tgid_base_stuff[] = {  	INF("cmdline",    S_IRUGO, proc_pid_cmdline),  	ONE("stat",       S_IRUGO, proc_tgid_stat),  	ONE("statm",      S_IRUGO, proc_pid_statm), -	REG("maps",       S_IRUGO, proc_maps_operations), +	REG("maps",       S_IRUGO, proc_pid_maps_operations),  #ifdef CONFIG_NUMA -	REG("numa_maps",  S_IRUGO, proc_numa_maps_operations), +	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),  #endif  	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),  	LNK("cwd",        proc_cwd_link), @@ -3037,7 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = {  	REG("mountstats", S_IRUSR, proc_mountstats_operations),  #ifdef CONFIG_PROC_PAGE_MONITOR  	REG("clear_refs", S_IWUSR, proc_clear_refs_operations), -	REG("smaps",      S_IRUGO, proc_smaps_operations), +	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),  	REG("pagemap",    S_IRUGO, proc_pagemap_operations),  #endif  #ifdef CONFIG_SECURITY @@ -3383,9 +3348,9 @@ static const struct pid_entry tid_base_stuff[] = {  	INF("cmdline",   S_IRUGO, proc_pid_cmdline),  	ONE("stat",      S_IRUGO, proc_tid_stat),  	ONE("statm",     S_IRUGO, proc_pid_statm), -	REG("maps",      S_IRUGO, proc_maps_operations), +	REG("maps",      S_IRUGO, proc_tid_maps_operations),  #ifdef CONFIG_NUMA -	REG("numa_maps", S_IRUGO, proc_numa_maps_operations), +	
REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),  #endif  	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),  	LNK("cwd",       proc_cwd_link), @@ -3395,7 +3360,7 @@ static const struct pid_entry tid_base_stuff[] = {  	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),  #ifdef CONFIG_PROC_PAGE_MONITOR  	REG("clear_refs", S_IWUSR, proc_clear_refs_operations), -	REG("smaps",     S_IRUGO, proc_smaps_operations), +	REG("smaps",     S_IRUGO, proc_tid_smaps_operations),  	REG("pagemap",    S_IRUGO, proc_pagemap_operations),  #endif  #ifdef CONFIG_SECURITY diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 84fd3235a59..8461a7b82fd 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -486,8 +486,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)  int proc_fill_super(struct super_block *s)  { -	struct inode * root_inode; -  	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;  	s->s_blocksize = 1024;  	s->s_blocksize_bits = 10; @@ -496,19 +494,11 @@ int proc_fill_super(struct super_block *s)  	s->s_time_gran = 1;  	pde_get(&proc_root); -	root_inode = proc_get_inode(s, &proc_root); -	if (!root_inode) -		goto out_no_root; -	root_inode->i_uid = 0; -	root_inode->i_gid = 0; -	s->s_root = d_alloc_root(root_inode); -	if (!s->s_root) -		goto out_no_root; -	return 0; +	s->s_root = d_make_root(proc_get_inode(s, &proc_root)); +	if (s->s_root) +		return 0; -out_no_root:  	printk("proc_read_super: get root inode failed\n"); -	iput(root_inode);  	pde_put(&proc_root);  	return -ENOMEM;  } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 292577531ad..5f79bb8b4c6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -10,12 +10,15 @@   */  #include <linux/proc_fs.h> +struct  ctl_table_header;  extern struct proc_dir_entry proc_root;  #ifdef CONFIG_PROC_SYSCTL  extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *head);  #else  static inline void proc_sys_init(void) { } +static inline 
void sysctl_head_put(struct ctl_table_header *head) { }  #endif  #ifdef CONFIG_NET  extern int proc_net_init(void); @@ -53,9 +56,12 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,  				struct pid *pid, struct task_struct *task);  extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); -extern const struct file_operations proc_maps_operations; -extern const struct file_operations proc_numa_maps_operations; -extern const struct file_operations proc_smaps_operations; +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations;  extern const struct file_operations proc_clear_refs_operations;  extern const struct file_operations proc_pagemap_operations;  extern const struct file_operations proc_net_operations; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d245cb23dd7..86c67eee439 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -157,7 +157,8 @@ static int kcore_update_ram(void)  #ifdef CONFIG_SPARSEMEM_VMEMMAP  /* calculate vmemmap's address from given system ram pfn and register it */ -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)  {  	unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;  	unsigned long nr_pages = ent->size >> PAGE_SHIFT; @@ -189,7 +190,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)  }  #else -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)  {  	return 1;  } @@ -513,7 +515,7 @@ read_kcore(struct file *file, 
char __user *buffer, size_t buflen, loff_t *fpos)  				n = copy_to_user(buffer, (char *)start, tsz);  				/* -				 * We cannot distingush between fault on source +				 * We cannot distinguish between fault on source  				 * and fault on destination. When this happens  				 * we clear too and hope it will trigger the  				 * EFAULT again. diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 27da860115c..3551f1f839e 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -53,7 +53,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,  	ei->ns_ops    = ns_ops;  	ei->ns	      = ns; -	dentry->d_op = &pid_dentry_operations; +	d_set_d_op(dentry, &pid_dentry_operations);  	d_add(dentry, inode);  	/* Close the race of the process dying before we return the dentry */  	if (pid_revalidate(dentry, NULL)) diff --git a/fs/proc/page.c b/fs/proc/page.c index 6d8e6a9e93a..7fcd0d60a96 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page)  		u |= 1 << KPF_COMPOUND_TAIL;  	if (PageHuge(page))  		u |= 1 << KPF_HUGE; +	else if (PageTransCompound(page)) +		u |= 1 << KPF_THP;  	/*  	 * Caveats on high order pages: page->_count will only be set diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a6b62173d4c..21d836f4029 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -6,7 +6,10 @@  #include <linux/poll.h>  #include <linux/proc_fs.h>  #include <linux/security.h> +#include <linux/sched.h>  #include <linux/namei.h> +#include <linux/mm.h> +#include <linux/module.h>  #include "internal.h"  static const struct dentry_operations proc_sys_dentry_operations; @@ -24,6 +27,371 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)  	wake_up_interruptible(&poll->wait);  } +static struct ctl_table root_table[] = { +	{ +		.procname = "", +		.mode = S_IFDIR|S_IRUGO|S_IXUGO, +	}, +	{ } +}; +static struct ctl_table_root sysctl_table_root = { +	.default_set.dir.header = { +		{{.count = 1, 
+		  .nreg = 1, +		  .ctl_table = root_table }}, +		.ctl_table_arg = root_table, +		.root = &sysctl_table_root, +		.set = &sysctl_table_root.default_set, +	}, +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +static void drop_sysctl_table(struct ctl_table_header *header); +static int sysctl_follow_link(struct ctl_table_header **phead, +	struct ctl_table **pentry, struct nsproxy *namespaces); +static int insert_links(struct ctl_table_header *head); +static void put_links(struct ctl_table_header *header); + +static void sysctl_print_dir(struct ctl_dir *dir) +{ +	if (dir->header.parent) +		sysctl_print_dir(dir->header.parent); +	printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname); +} + +static int namecmp(const char *name1, int len1, const char *name2, int len2) +{ +	int minlen; +	int cmp; + +	minlen = len1; +	if (minlen > len2) +		minlen = len2; + +	cmp = memcmp(name1, name2, minlen); +	if (cmp == 0) +		cmp = len1 - len2; +	return cmp; +} + +/* Called under sysctl_lock */ +static struct ctl_table *find_entry(struct ctl_table_header **phead, +	struct ctl_dir *dir, const char *name, int namelen) +{ +	struct ctl_table_header *head; +	struct ctl_table *entry; +	struct rb_node *node = dir->root.rb_node; + +	while (node) +	{ +		struct ctl_node *ctl_node; +		const char *procname; +		int cmp; + +		ctl_node = rb_entry(node, struct ctl_node, node); +		head = ctl_node->header; +		entry = &head->ctl_table[ctl_node - head->node]; +		procname = entry->procname; + +		cmp = namecmp(name, namelen, procname, strlen(procname)); +		if (cmp < 0) +			node = node->rb_left; +		else if (cmp > 0) +			node = node->rb_right; +		else { +			*phead = head; +			return entry; +		} +	} +	return NULL; +} + +static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ +	struct rb_node *node = &head->node[entry - head->ctl_table].node; +	struct rb_node **p = &head->parent->root.rb_node; +	struct rb_node *parent = NULL; +	const char *name = entry->procname; +	int namelen = 
strlen(name); + +	while (*p) { +		struct ctl_table_header *parent_head; +		struct ctl_table *parent_entry; +		struct ctl_node *parent_node; +		const char *parent_name; +		int cmp; + +		parent = *p; +		parent_node = rb_entry(parent, struct ctl_node, node); +		parent_head = parent_node->header; +		parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; +		parent_name = parent_entry->procname; + +		cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); +		if (cmp < 0) +			p = &(*p)->rb_left; +		else if (cmp > 0) +			p = &(*p)->rb_right; +		else { +			printk(KERN_ERR "sysctl duplicate entry: "); +			sysctl_print_dir(head->parent); +			printk(KERN_CONT "/%s\n", entry->procname); +			return -EEXIST; +		} +	} + +	rb_link_node(node, parent, p); +	return 0; +} + +static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ +	struct rb_node *node = &head->node[entry - head->ctl_table].node; + +	rb_erase(node, &head->parent->root); +} + +static void init_header(struct ctl_table_header *head, +	struct ctl_table_root *root, struct ctl_table_set *set, +	struct ctl_node *node, struct ctl_table *table) +{ +	head->ctl_table = table; +	head->ctl_table_arg = table; +	head->used = 0; +	head->count = 1; +	head->nreg = 1; +	head->unregistering = NULL; +	head->root = root; +	head->set = set; +	head->parent = NULL; +	head->node = node; +	if (node) { +		struct ctl_table *entry; +		for (entry = table; entry->procname; entry++, node++) { +			rb_init_node(&node->node); +			node->header = head; +		} +	} +} + +static void erase_header(struct ctl_table_header *head) +{ +	struct ctl_table *entry; +	for (entry = head->ctl_table; entry->procname; entry++) +		erase_entry(head, entry); +} + +static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) +{ +	struct ctl_table *entry; +	int err; + +	dir->header.nreg++; +	header->parent = dir; +	err = insert_links(header); +	if (err) +		goto fail_links; +	for (entry = header->ctl_table; 
entry->procname; entry++) { +		err = insert_entry(header, entry); +		if (err) +			goto fail; +	} +	return 0; +fail: +	erase_header(header); +	put_links(header); +fail_links: +	header->parent = NULL; +	drop_sysctl_table(&dir->header); +	return err; +} + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ +	if (unlikely(p->unregistering)) +		return 0; +	p->used++; +	return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ +	if (!--p->used) +		if (unlikely(p->unregistering)) +			complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ +	/* +	 * if p->used is 0, nobody will ever touch that entry again; +	 * we'll eliminate all paths to it before dropping sysctl_lock +	 */ +	if (unlikely(p->used)) { +		struct completion wait; +		init_completion(&wait); +		p->unregistering = &wait; +		spin_unlock(&sysctl_lock); +		wait_for_completion(&wait); +		spin_lock(&sysctl_lock); +	} else { +		/* anything non-NULL; we'll never dereference it */ +		p->unregistering = ERR_PTR(-EINVAL); +	} +	/* +	 * do not remove from the list until nobody holds it; walking the +	 * list in do_sysctl() relies on that. 
+	 */ +	erase_header(p); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ +	spin_lock(&sysctl_lock); +	head->count++; +	spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ +	spin_lock(&sysctl_lock); +	if (!--head->count) +		kfree_rcu(head, rcu); +	spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ +	if (!head) +		BUG(); +	spin_lock(&sysctl_lock); +	if (!use_table(head)) +		head = ERR_PTR(-ENOENT); +	spin_unlock(&sysctl_lock); +	return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ +	if (!head) +		return; +	spin_lock(&sysctl_lock); +	unuse_table(head); +	spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ +	struct ctl_table_set *set = &root->default_set; +	if (root->lookup) +		set = root->lookup(root, namespaces); +	return set; +} + +static struct ctl_table *lookup_entry(struct ctl_table_header **phead, +				      struct ctl_dir *dir, +				      const char *name, int namelen) +{ +	struct ctl_table_header *head; +	struct ctl_table *entry; + +	spin_lock(&sysctl_lock); +	entry = find_entry(&head, dir, name, namelen); +	if (entry && use_table(head)) +		*phead = head; +	else +		entry = NULL; +	spin_unlock(&sysctl_lock); +	return entry; +} + +static struct ctl_node *first_usable_entry(struct rb_node *node) +{ +	struct ctl_node *ctl_node; + +	for (;node; node = rb_next(node)) { +		ctl_node = rb_entry(node, struct ctl_node, node); +		if (use_table(ctl_node->header)) +			return ctl_node; +	} +	return NULL; +} + +static void first_entry(struct ctl_dir *dir, +	struct ctl_table_header **phead, struct ctl_table **pentry) +{ +	struct ctl_table_header *head = NULL; +	struct ctl_table *entry = NULL; +	struct ctl_node *ctl_node; + +	spin_lock(&sysctl_lock); +	ctl_node = first_usable_entry(rb_first(&dir->root)); +	spin_unlock(&sysctl_lock); +	if 
(ctl_node) { +		head = ctl_node->header; +		entry = &head->ctl_table[ctl_node - head->node]; +	} +	*phead = head; +	*pentry = entry; +} + +static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +{ +	struct ctl_table_header *head = *phead; +	struct ctl_table *entry = *pentry; +	struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; + +	spin_lock(&sysctl_lock); +	unuse_table(head); + +	ctl_node = first_usable_entry(rb_next(&ctl_node->node)); +	spin_unlock(&sysctl_lock); +	head = NULL; +	if (ctl_node) { +		head = ctl_node->header; +		entry = &head->ctl_table[ctl_node - head->node]; +	} +	*phead = head; +	*pentry = entry; +} + +void register_sysctl_root(struct ctl_table_root *root) +{ +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ +	if (!current_euid()) +		mode >>= 6; +	else if (in_egroup_p(0)) +		mode >>= 3; +	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) +		return 0; +	return -EACCES; +} + +static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +{ +	int mode; + +	if (root->permissions) +		mode = root->permissions(root, current->nsproxy, table); +	else +		mode = table->mode; + +	return test_perm(mode, op); +} +  static struct inode *proc_sys_make_inode(struct super_block *sb,  		struct ctl_table_header *head, struct ctl_table *table)  { @@ -43,13 +411,12 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,  	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;  	inode->i_mode = table->mode; -	if (!table->child) { +	if (!S_ISDIR(table->mode)) {  		inode->i_mode |= S_IFREG;  		inode->i_op = &proc_sys_inode_operations;  		inode->i_fop = &proc_sys_file_operations;  	} else {  		inode->i_mode |= S_IFDIR; -		clear_nlink(inode);  		inode->i_op = &proc_sys_dir_operations;  		inode->i_fop = &proc_sys_dir_file_operations;  	} @@ 
-57,70 +424,42 @@ out:  	return inode;  } -static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) -{ -	int len; -	for ( ; p->procname; p++) { - -		if (!p->procname) -			continue; - -		len = strlen(p->procname); -		if (len != name->len) -			continue; - -		if (memcmp(p->procname, name->name, len) != 0) -			continue; - -		/* I have a match */ -		return p; -	} -	return NULL; -} -  static struct ctl_table_header *grab_header(struct inode *inode)  { -	if (PROC_I(inode)->sysctl) -		return sysctl_head_grab(PROC_I(inode)->sysctl); -	else -		return sysctl_head_next(NULL); +	struct ctl_table_header *head = PROC_I(inode)->sysctl; +	if (!head) +		head = &sysctl_table_root.default_set.dir.header; +	return sysctl_head_grab(head);  }  static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,  					struct nameidata *nd)  {  	struct ctl_table_header *head = grab_header(dir); -	struct ctl_table *table = PROC_I(dir)->sysctl_entry;  	struct ctl_table_header *h = NULL;  	struct qstr *name = &dentry->d_name;  	struct ctl_table *p;  	struct inode *inode;  	struct dentry *err = ERR_PTR(-ENOENT); +	struct ctl_dir *ctl_dir; +	int ret;  	if (IS_ERR(head))  		return ERR_CAST(head); -	if (table && !table->child) { -		WARN_ON(1); -		goto out; -	} - -	table = table ? table->child : head->ctl_table; - -	p = find_in_table(table, name); -	if (!p) { -		for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { -			if (h->attached_to != table) -				continue; -			p = find_in_table(h->attached_by, name); -			if (p) -				break; -		} -	} +	ctl_dir = container_of(head, struct ctl_dir, header); +	p = lookup_entry(&h, ctl_dir, name->name, name->len);  	if (!p)  		goto out; +	if (S_ISLNK(p->mode)) { +		ret = sysctl_follow_link(&h, &p, current->nsproxy); +		err = ERR_PTR(ret); +		if (ret) +			goto out; +	} +  	err = ERR_PTR(-ENOMEM);  	inode = proc_sys_make_inode(dir->i_sb, h ? 
h : head, p);  	if (h) @@ -188,20 +527,32 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,  static int proc_sys_open(struct inode *inode, struct file *filp)  { +	struct ctl_table_header *head = grab_header(inode);  	struct ctl_table *table = PROC_I(inode)->sysctl_entry; +	/* sysctl was unregistered */ +	if (IS_ERR(head)) +		return PTR_ERR(head); +  	if (table->poll)  		filp->private_data = proc_sys_poll_event(table->poll); +	sysctl_head_finish(head); +  	return 0;  }  static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)  {  	struct inode *inode = filp->f_path.dentry->d_inode; +	struct ctl_table_header *head = grab_header(inode);  	struct ctl_table *table = PROC_I(inode)->sysctl_entry; -	unsigned long event = (unsigned long)filp->private_data;  	unsigned int ret = DEFAULT_POLLMASK; +	unsigned long event; + +	/* sysctl was unregistered */ +	if (IS_ERR(head)) +		return POLLERR | POLLHUP;  	if (!table->proc_handler)  		goto out; @@ -209,6 +560,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)  	if (!table->poll)  		goto out; +	event = (unsigned long)filp->private_data;  	poll_wait(filp, &table->poll->wait, wait);  	if (event != atomic_read(&table->poll->event)) { @@ -217,6 +569,8 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)  	}  out: +	sysctl_head_finish(head); +  	return ret;  } @@ -258,28 +612,45 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,  	return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);  } +static int proc_sys_link_fill_cache(struct file *filp, void *dirent, +				    filldir_t filldir, +				    struct ctl_table_header *head, +				    struct ctl_table *table) +{ +	int err, ret = 0; +	head = sysctl_head_grab(head); + +	if (S_ISLNK(table->mode)) { +		/* It is not an error if we can not follow the link ignore it */ +		err = sysctl_follow_link(&head, &table, current->nsproxy); +		if (err) +			goto out; +	} + +	ret = 
proc_sys_fill_cache(filp, dirent, filldir, head, table); +out: +	sysctl_head_finish(head); +	return ret; +} +  static int scan(struct ctl_table_header *head, ctl_table *table,  		unsigned long *pos, struct file *file,  		void *dirent, filldir_t filldir)  { +	int res; -	for (; table->procname; table++, (*pos)++) { -		int res; - -		/* Can't do anything without a proc name */ -		if (!table->procname) -			continue; - -		if (*pos < file->f_pos) -			continue; +	if ((*pos)++ < file->f_pos) +		return 0; +	if (unlikely(S_ISLNK(table->mode))) +		res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); +	else  		res = proc_sys_fill_cache(file, dirent, filldir, head, table); -		if (res) -			return res; -		file->f_pos = *pos + 1; -	} -	return 0; +	if (res == 0) +		file->f_pos = *pos; + +	return res;  }  static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) @@ -287,20 +658,16 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)  	struct dentry *dentry = filp->f_path.dentry;  	struct inode *inode = dentry->d_inode;  	struct ctl_table_header *head = grab_header(inode); -	struct ctl_table *table = PROC_I(inode)->sysctl_entry;  	struct ctl_table_header *h = NULL; +	struct ctl_table *entry; +	struct ctl_dir *ctl_dir;  	unsigned long pos;  	int ret = -EINVAL;  	if (IS_ERR(head))  		return PTR_ERR(head); -	if (table && !table->child) { -		WARN_ON(1); -		goto out; -	} - -	table = table ? 
table->child : head->ctl_table; +	ctl_dir = container_of(head, struct ctl_dir, header);  	ret = 0;  	/* Avoid a switch here: arm builds fail with missing __cmpdi2 */ @@ -318,14 +685,8 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)  	}  	pos = 2; -	ret = scan(head, table, &pos, filp, dirent, filldir); -	if (ret) -		goto out; - -	for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { -		if (h->attached_to != table) -			continue; -		ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); +	for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { +		ret = scan(h, entry, &pos, filp, dirent, filldir);  		if (ret) {  			sysctl_head_finish(h);  			break; @@ -445,6 +806,21 @@ static int proc_sys_delete(const struct dentry *dentry)  	return !!PROC_I(dentry->d_inode)->sysctl->unregistering;  } +static int sysctl_is_seen(struct ctl_table_header *p) +{ +	struct ctl_table_set *set = p->set; +	int res; +	spin_lock(&sysctl_lock); +	if (p->unregistering) +		res = 0; +	else if (!set->is_seen) +		res = 1; +	else +		res = set->is_seen(set); +	spin_unlock(&sysctl_lock); +	return res; +} +  static int proc_sys_compare(const struct dentry *parent,  		const struct inode *pinode,  		const struct dentry *dentry, const struct inode *inode, @@ -470,6 +846,753 @@ static const struct dentry_operations proc_sys_dentry_operations = {  	.d_compare	= proc_sys_compare,  }; +static struct ctl_dir *find_subdir(struct ctl_dir *dir, +				   const char *name, int namelen) +{ +	struct ctl_table_header *head; +	struct ctl_table *entry; + +	entry = find_entry(&head, dir, name, namelen); +	if (!entry) +		return ERR_PTR(-ENOENT); +	if (!S_ISDIR(entry->mode)) +		return ERR_PTR(-ENOTDIR); +	return container_of(head, struct ctl_dir, header); +} + +static struct ctl_dir *new_dir(struct ctl_table_set *set, +			       const char *name, int namelen) +{ +	struct ctl_table *table; +	struct ctl_dir *new; +	struct ctl_node *node; +	char *new_name; + +	new = 
kzalloc(sizeof(*new) + sizeof(struct ctl_node) + +		      sizeof(struct ctl_table)*2 +  namelen + 1, +		      GFP_KERNEL); +	if (!new) +		return NULL; + +	node = (struct ctl_node *)(new + 1); +	table = (struct ctl_table *)(node + 1); +	new_name = (char *)(table + 2); +	memcpy(new_name, name, namelen); +	new_name[namelen] = '\0'; +	table[0].procname = new_name; +	table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; +	init_header(&new->header, set->dir.header.root, set, node, table); + +	return new; +} + +/** + * get_subdir - find or create a subdir with the specified name. + * @dir:  Directory to create the subdirectory in + * @name: The name of the subdirectory to find or create + * @namelen: The length of name + * + * Takes a directory with an elevated reference count so we know that + * if we drop the lock the directory will not go away.  Upon success + * the reference is moved from @dir to the returned subdirectory. + * Upon error an error code is returned and the reference on @dir is + * simply dropped. + */ +static struct ctl_dir *get_subdir(struct ctl_dir *dir, +				  const char *name, int namelen) +{ +	struct ctl_table_set *set = dir->header.set; +	struct ctl_dir *subdir, *new = NULL; +	int err; + +	spin_lock(&sysctl_lock); +	subdir = find_subdir(dir, name, namelen); +	if (!IS_ERR(subdir)) +		goto found; +	if (PTR_ERR(subdir) != -ENOENT) +		goto failed; + +	spin_unlock(&sysctl_lock); +	new = new_dir(set, name, namelen); +	spin_lock(&sysctl_lock); +	subdir = ERR_PTR(-ENOMEM); +	if (!new) +		goto failed; + +	/* Was the subdir added while we dropped the lock? */ +	subdir = find_subdir(dir, name, namelen); +	if (!IS_ERR(subdir)) +		goto found; +	if (PTR_ERR(subdir) != -ENOENT) +		goto failed; + +	/* Nope.  Use the our freshly made directory entry. 
*/ +	err = insert_header(dir, &new->header); +	subdir = ERR_PTR(err); +	if (err) +		goto failed; +	subdir = new; +found: +	subdir->header.nreg++; +failed: +	if (unlikely(IS_ERR(subdir))) { +		printk(KERN_ERR "sysctl could not get directory: "); +		sysctl_print_dir(dir); +		printk(KERN_CONT "/%*.*s %ld\n", +			namelen, namelen, name, PTR_ERR(subdir)); +	} +	drop_sysctl_table(&dir->header); +	if (new) +		drop_sysctl_table(&new->header); +	spin_unlock(&sysctl_lock); +	return subdir; +} + +static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) +{ +	struct ctl_dir *parent; +	const char *procname; +	if (!dir->header.parent) +		return &set->dir; +	parent = xlate_dir(set, dir->header.parent); +	if (IS_ERR(parent)) +		return parent; +	procname = dir->header.ctl_table[0].procname; +	return find_subdir(parent, procname, strlen(procname)); +} + +static int sysctl_follow_link(struct ctl_table_header **phead, +	struct ctl_table **pentry, struct nsproxy *namespaces) +{ +	struct ctl_table_header *head; +	struct ctl_table_root *root; +	struct ctl_table_set *set; +	struct ctl_table *entry; +	struct ctl_dir *dir; +	int ret; + +	ret = 0; +	spin_lock(&sysctl_lock); +	root = (*pentry)->data; +	set = lookup_header_set(root, namespaces); +	dir = xlate_dir(set, (*phead)->parent); +	if (IS_ERR(dir)) +		ret = PTR_ERR(dir); +	else { +		const char *procname = (*pentry)->procname; +		head = NULL; +		entry = find_entry(&head, dir, procname, strlen(procname)); +		ret = -ENOENT; +		if (entry && use_table(head)) { +			unuse_table(*phead); +			*phead = head; +			*pentry = entry; +			ret = 0; +		} +	} + +	spin_unlock(&sysctl_lock); +	return ret; +} + +static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) 
+{ +	struct va_format vaf; +	va_list args; + +	va_start(args, fmt); +	vaf.fmt = fmt; +	vaf.va = &args; + +	printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n", +		path, table->procname, &vaf); + +	va_end(args); +	return -EINVAL; +} + +static int sysctl_check_table(const char *path, struct ctl_table *table) +{ +	int err = 0; +	for (; table->procname; table++) { +		if (table->child) +			err = sysctl_err(path, table, "Not a file"); + +		if ((table->proc_handler == proc_dostring) || +		    (table->proc_handler == proc_dointvec) || +		    (table->proc_handler == proc_dointvec_minmax) || +		    (table->proc_handler == proc_dointvec_jiffies) || +		    (table->proc_handler == proc_dointvec_userhz_jiffies) || +		    (table->proc_handler == proc_dointvec_ms_jiffies) || +		    (table->proc_handler == proc_doulongvec_minmax) || +		    (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { +			if (!table->data) +				err = sysctl_err(path, table, "No data"); +			if (!table->maxlen) +				err = sysctl_err(path, table, "No maxlen"); +		} +		if (!table->proc_handler) +			err = sysctl_err(path, table, "No proc_handler"); + +		if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) +			err = sysctl_err(path, table, "bogus .mode 0%o", +				table->mode); +	} +	return err; +} + +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, +	struct ctl_table_root *link_root) +{ +	struct ctl_table *link_table, *entry, *link; +	struct ctl_table_header *links; +	struct ctl_node *node; +	char *link_name; +	int nr_entries, name_bytes; + +	name_bytes = 0; +	nr_entries = 0; +	for (entry = table; entry->procname; entry++) { +		nr_entries++; +		name_bytes += strlen(entry->procname) + 1; +	} + +	links = kzalloc(sizeof(struct ctl_table_header) + +			sizeof(struct ctl_node)*nr_entries + +			sizeof(struct ctl_table)*(nr_entries + 1) + +			name_bytes, +			GFP_KERNEL); + +	if (!links) +		return NULL; + +	node = (struct ctl_node *)(links + 1); +	link_table = (struct 
ctl_table *)(node + nr_entries); +	link_name = (char *)&link_table[nr_entries + 1]; + +	for (link = link_table, entry = table; entry->procname; link++, entry++) { +		int len = strlen(entry->procname) + 1; +		memcpy(link_name, entry->procname, len); +		link->procname = link_name; +		link->mode = S_IFLNK|S_IRWXUGO; +		link->data = link_root; +		link_name += len; +	} +	init_header(links, dir->header.root, dir->header.set, node, link_table); +	links->nreg = nr_entries; + +	return links; +} + +static bool get_links(struct ctl_dir *dir, +	struct ctl_table *table, struct ctl_table_root *link_root) +{ +	struct ctl_table_header *head; +	struct ctl_table *entry, *link; + +	/* Are there links available for every entry in table? */ +	for (entry = table; entry->procname; entry++) { +		const char *procname = entry->procname; +		link = find_entry(&head, dir, procname, strlen(procname)); +		if (!link) +			return false; +		if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) +			continue; +		if (S_ISLNK(link->mode) && (link->data == link_root)) +			continue; +		return false; +	} + +	/* The checks passed.  
Increase the registration count on the links */ +	for (entry = table; entry->procname; entry++) { +		const char *procname = entry->procname; +		link = find_entry(&head, dir, procname, strlen(procname)); +		head->nreg++; +	} +	return true; +} + +static int insert_links(struct ctl_table_header *head) +{ +	struct ctl_table_set *root_set = &sysctl_table_root.default_set; +	struct ctl_dir *core_parent = NULL; +	struct ctl_table_header *links; +	int err; + +	if (head->set == root_set) +		return 0; + +	core_parent = xlate_dir(root_set, head->parent); +	if (IS_ERR(core_parent)) +		return 0; + +	if (get_links(core_parent, head->ctl_table, head->root)) +		return 0; + +	core_parent->header.nreg++; +	spin_unlock(&sysctl_lock); + +	links = new_links(core_parent, head->ctl_table, head->root); + +	spin_lock(&sysctl_lock); +	err = -ENOMEM; +	if (!links) +		goto out; + +	err = 0; +	if (get_links(core_parent, head->ctl_table, head->root)) { +		kfree(links); +		goto out; +	} + +	err = insert_header(core_parent, links); +	if (err) +		kfree(links); +out: +	drop_sysctl_table(&core_parent->header); +	return err; +} + +/** + * __register_sysctl_table - register a leaf sysctl table + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + *            enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file + * + * child - must be %NULL. 
+ * + * proc_handler - the text handler routine (described below) + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * There must be a proc_handler routine for any terminal nodes. + * Several default handlers are available to cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_table( +	struct ctl_table_set *set, +	const char *path, struct ctl_table *table) +{ +	struct ctl_table_root *root = set->dir.header.root; +	struct ctl_table_header *header; +	const char *name, *nextname; +	struct ctl_dir *dir; +	struct ctl_table *entry; +	struct ctl_node *node; +	int nr_entries = 0; + +	for (entry = table; entry->procname; entry++) +		nr_entries++; + +	header = kzalloc(sizeof(struct ctl_table_header) + +			 sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); +	if (!header) +		return NULL; + +	node = (struct ctl_node *)(header + 1); +	init_header(header, root, set, node, table); +	if (sysctl_check_table(path, table)) +		goto fail; + +	spin_lock(&sysctl_lock); +	dir = &set->dir; +	/* Reference moved down the diretory tree get_subdir */ +	dir->header.nreg++; +	spin_unlock(&sysctl_lock); + +	/* Find the directory for the ctl_table */ +	for (name = path; name; name = nextname) { +		int namelen; +		nextname = strchr(name, '/'); +		if (nextname) { +			namelen = nextname - name; +			nextname++; +		} else { +			namelen = strlen(name); +		} +		if (namelen == 0) +			
continue; + +		dir = get_subdir(dir, name, namelen); +		if (IS_ERR(dir)) +			goto fail; +	} + +	spin_lock(&sysctl_lock); +	if (insert_header(dir, header)) +		goto fail_put_dir_locked; + +	drop_sysctl_table(&dir->header); +	spin_unlock(&sysctl_lock); + +	return header; + +fail_put_dir_locked: +	drop_sysctl_table(&dir->header); +	spin_unlock(&sysctl_lock); +fail: +	kfree(header); +	dump_stack(); +	return NULL; +} + +/** + * register_sysctl - register a sysctl table + * @path: The path to the directory the sysctl table is in. + * @table: the table structure + * + * Register a sysctl table. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +{ +	return __register_sysctl_table(&sysctl_table_root.default_set, +					path, table); +} +EXPORT_SYMBOL(register_sysctl); + +static char *append_path(const char *path, char *pos, const char *name) +{ +	int namelen; +	namelen = strlen(name); +	if (((pos - path) + namelen + 2) >= PATH_MAX) +		return NULL; +	memcpy(pos, name, namelen); +	pos[namelen] = '/'; +	pos[namelen + 1] = '\0'; +	pos += namelen + 1; +	return pos; +} + +static int count_subheaders(struct ctl_table *table) +{ +	int has_files = 0; +	int nr_subheaders = 0; +	struct ctl_table *entry; + +	/* special case: no directory and empty directory */ +	if (!table || !table->procname) +		return 1; + +	for (entry = table; entry->procname; entry++) { +		if (entry->child) +			nr_subheaders += count_subheaders(entry->child); +		else +			has_files = 1; +	} +	return nr_subheaders + has_files; +} + +static int register_leaf_sysctl_tables(const char *path, char *pos, +	struct ctl_table_header ***subheader, struct ctl_table_set *set, +	struct ctl_table *table) +{ +	struct ctl_table *ctl_table_arg = NULL; +	struct ctl_table *entry, *files; +	int nr_files = 0; +	int nr_dirs = 0; +	int err = -ENOMEM; 
+ +	for (entry = table; entry->procname; entry++) { +		if (entry->child) +			nr_dirs++; +		else +			nr_files++; +	} + +	files = table; +	/* If there are mixed files and directories we need a new table */ +	if (nr_dirs && nr_files) { +		struct ctl_table *new; +		files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), +				GFP_KERNEL); +		if (!files) +			goto out; + +		ctl_table_arg = files; +		for (new = files, entry = table; entry->procname; entry++) { +			if (entry->child) +				continue; +			*new = *entry; +			new++; +		} +	} + +	/* Register everything except a directory full of subdirectories */ +	if (nr_files || !nr_dirs) { +		struct ctl_table_header *header; +		header = __register_sysctl_table(set, path, files); +		if (!header) { +			kfree(ctl_table_arg); +			goto out; +		} + +		/* Remember if we need to free the file table */ +		header->ctl_table_arg = ctl_table_arg; +		**subheader = header; +		(*subheader)++; +	} + +	/* Recurse into the subdirectories. */ +	for (entry = table; entry->procname; entry++) { +		char *child_pos; + +		if (!entry->child) +			continue; + +		err = -ENAMETOOLONG; +		child_pos = append_path(path, pos, entry->procname); +		if (!child_pos) +			goto out; + +		err = register_leaf_sysctl_tables(path, child_pos, subheader, +						  set, entry->child); +		pos[0] = '\0'; +		if (err) +			goto out; +	} +	err = 0; +out: +	/* On failure our caller will unregister all registered subheaders */ +	return err; +} + +/** + * __register_sysctl_paths - register a sysctl table hierarchy + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. 
+ */ +struct ctl_table_header *__register_sysctl_paths( +	struct ctl_table_set *set, +	const struct ctl_path *path, struct ctl_table *table) +{ +	struct ctl_table *ctl_table_arg = table; +	int nr_subheaders = count_subheaders(table); +	struct ctl_table_header *header = NULL, **subheaders, **subheader; +	const struct ctl_path *component; +	char *new_path, *pos; + +	pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); +	if (!new_path) +		return NULL; + +	pos[0] = '\0'; +	for (component = path; component->procname; component++) { +		pos = append_path(new_path, pos, component->procname); +		if (!pos) +			goto out; +	} +	while (table->procname && table->child && !table[1].procname) { +		pos = append_path(new_path, pos, table->procname); +		if (!pos) +			goto out; +		table = table->child; +	} +	if (nr_subheaders == 1) { +		header = __register_sysctl_table(set, new_path, table); +		if (header) +			header->ctl_table_arg = ctl_table_arg; +	} else { +		header = kzalloc(sizeof(*header) + +				 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); +		if (!header) +			goto out; + +		subheaders = (struct ctl_table_header **) (header + 1); +		subheader = subheaders; +		header->ctl_table_arg = ctl_table_arg; + +		if (register_leaf_sysctl_tables(new_path, pos, &subheader, +						set, table)) +			goto err_register_leaves; +	} + +out: +	kfree(new_path); +	return header; + +err_register_leaves: +	while (subheader > subheaders) { +		struct ctl_table_header *subh = *(--subheader); +		struct ctl_table *table = subh->ctl_table_arg; +		unregister_sysctl_table(subh); +		kfree(table); +	} +	kfree(header); +	header = NULL; +	goto out; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. 
+ * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, +						struct ctl_table *table) +{ +	return __register_sysctl_paths(&sysctl_table_root.default_set, +					path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ +	static const struct ctl_path null_path[] = { {} }; + +	return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +static void put_links(struct ctl_table_header *header) +{ +	struct ctl_table_set *root_set = &sysctl_table_root.default_set; +	struct ctl_table_root *root = header->root; +	struct ctl_dir *parent = header->parent; +	struct ctl_dir *core_parent; +	struct ctl_table *entry; + +	if (header->set == root_set) +		return; + +	core_parent = xlate_dir(root_set, parent); +	if (IS_ERR(core_parent)) +		return; + +	for (entry = header->ctl_table; entry->procname; entry++) { +		struct ctl_table_header *link_head; +		struct ctl_table *link; +		const char *name = entry->procname; + +		link = find_entry(&link_head, core_parent, name, strlen(name)); +		if (link && +		    ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || +		     (S_ISLNK(link->mode) && (link->data == root)))) { +			drop_sysctl_table(link_head); +		} +		else { +			printk(KERN_ERR "sysctl link missing during unregister: "); +			sysctl_print_dir(parent); +			printk(KERN_CONT "/%s\n", name); +		} +	} +} + +static void drop_sysctl_table(struct ctl_table_header *header) +{ +	struct ctl_dir *parent = header->parent; + +	if (--header->nreg) +		return; + +	put_links(header); +	start_unregistering(header); +	
if (!--header->count) +		kfree_rcu(header, rcu); + +	if (parent) +		drop_sysctl_table(&parent->header); +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ +	int nr_subheaders; +	might_sleep(); + +	if (header == NULL) +		return; + +	nr_subheaders = count_subheaders(header->ctl_table_arg); +	if (unlikely(nr_subheaders > 1)) { +		struct ctl_table_header **subheaders; +		int i; + +		subheaders = (struct ctl_table_header **)(header + 1); +		for (i = nr_subheaders -1; i >= 0; i--) { +			struct ctl_table_header *subh = subheaders[i]; +			struct ctl_table *table = subh->ctl_table_arg; +			unregister_sysctl_table(subh); +			kfree(table); +		} +		kfree(header); +		return; +	} + +	spin_lock(&sysctl_lock); +	drop_sysctl_table(header); +	spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *set, +	struct ctl_table_root *root, +	int (*is_seen)(struct ctl_table_set *)) +{ +	memset(set, 0, sizeof(*set)); +	set->is_seen = is_seen; +	init_header(&set->dir.header, root, set, NULL, root_table); +} + +void retire_sysctl_set(struct ctl_table_set *set) +{ +	WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); +} +  int __init proc_sys_init(void)  {  	struct proc_dir_entry *proc_sys_root; @@ -478,5 +1601,6 @@ int __init proc_sys_init(void)  	proc_sys_root->proc_iops = &proc_sys_dir_operations;  	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;  	proc_sys_root->nlink = 0; -	return 0; + +	return sysctl_init();  } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 121f77cfef7..6a0c62d6e44 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -89,18 +89,19 @@ static int show_stat(struct seq_file *p, void *v)  	}  	sum += arch_irq_stat(); -	
seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu " -		"%llu\n", -		(unsigned long long)cputime64_to_clock_t(user), -		(unsigned long long)cputime64_to_clock_t(nice), -		(unsigned long long)cputime64_to_clock_t(system), -		(unsigned long long)cputime64_to_clock_t(idle), -		(unsigned long long)cputime64_to_clock_t(iowait), -		(unsigned long long)cputime64_to_clock_t(irq), -		(unsigned long long)cputime64_to_clock_t(softirq), -		(unsigned long long)cputime64_to_clock_t(steal), -		(unsigned long long)cputime64_to_clock_t(guest), -		(unsigned long long)cputime64_to_clock_t(guest_nice)); +	seq_puts(p, "cpu "); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); +	seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); +	seq_putc(p, '\n'); +  	for_each_online_cpu(i) {  		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */  		user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; @@ -113,26 +114,24 @@ static int show_stat(struct seq_file *p, void *v)  		steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];  		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];  		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; -		seq_printf(p, -			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " -			"%llu\n", -			i, -			(unsigned long long)cputime64_to_clock_t(user), -			(unsigned long long)cputime64_to_clock_t(nice), -			(unsigned long long)cputime64_to_clock_t(system), -			(unsigned long long)cputime64_to_clock_t(idle), -			(unsigned long long)cputime64_to_clock_t(iowait), -			
(unsigned long long)cputime64_to_clock_t(irq), -			(unsigned long long)cputime64_to_clock_t(softirq), -			(unsigned long long)cputime64_to_clock_t(steal), -			(unsigned long long)cputime64_to_clock_t(guest), -			(unsigned long long)cputime64_to_clock_t(guest_nice)); +		seq_printf(p, "cpu%d", i); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); +		seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); +		seq_putc(p, '\n');  	}  	seq_printf(p, "intr %llu", (unsigned long long)sum);  	/* sum again ? it could be updated? 
*/  	for_each_irq_nr(j) -		seq_printf(p, " %u", kstat_irqs(j)); +		seq_put_decimal_ull(p, ' ', kstat_irqs(j));  	seq_printf(p,  		"\nctxt %llu\n" @@ -149,7 +148,7 @@ static int show_stat(struct seq_file *p, void *v)  	seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);  	for (i = 0; i < NR_SOFTIRQS; i++) -		seq_printf(p, " %u", per_softirq_sums[i]); +		seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);  	seq_putc(p, '\n');  	return 0; @@ -157,11 +156,14 @@ static int show_stat(struct seq_file *p, void *v)  static int stat_open(struct inode *inode, struct file *file)  { -	unsigned size = 4096 * (1 + num_possible_cpus() / 32); +	unsigned size = 1024 + 128 * num_possible_cpus();  	char *buf;  	struct seq_file *m;  	int res; +	/* minimum size to display an interrupt count : 2 bytes */ +	size += 2 * nr_irqs; +  	/* don't ask for more than the kmalloc() max size */  	if (size > KMALLOC_MAX_SIZE)  		size = KMALLOC_MAX_SIZE; @@ -173,7 +175,7 @@ static int stat_open(struct inode *inode, struct file *file)  	if (!res) {  		m = file->private_data;  		m->buf = buf; -		m->size = size; +		m->size = ksize(buf);  	} else  		kfree(buf);  	return res; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e418c5abdb0..9694cc28351 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file,  	return ret;  } -static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) +static void +show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)  {  	struct mm_struct *mm = vma->vm_mm;  	struct file *file = vma->vm_file; +	struct proc_maps_private *priv = m->private; +	struct task_struct *task = priv->task;  	vm_flags_t flags = vma->vm_flags;  	unsigned long ino = 0;  	unsigned long long pgoff = 0;  	unsigned long start, end;  	dev_t dev = 0;  	int len; +	const char *name = NULL;  	if (file) {  		struct inode *inode = vma->vm_file->f_path.dentry->d_inode; @@ -252,36 
+256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)  	if (file) {  		pad_len_spaces(m, len);  		seq_path(m, &file->f_path, "\n"); -	} else { -		const char *name = arch_vma_name(vma); -		if (!name) { -			if (mm) { -				if (vma->vm_start <= mm->brk && -						vma->vm_end >= mm->start_brk) { -					name = "[heap]"; -				} else if (vma->vm_start <= mm->start_stack && -					   vma->vm_end >= mm->start_stack) { -					name = "[stack]"; -				} +		goto done; +	} + +	name = arch_vma_name(vma); +	if (!name) { +		pid_t tid; + +		if (!mm) { +			name = "[vdso]"; +			goto done; +		} + +		if (vma->vm_start <= mm->brk && +		    vma->vm_end >= mm->start_brk) { +			name = "[heap]"; +			goto done; +		} + +		tid = vm_is_stack(task, vma, is_pid); + +		if (tid != 0) { +			/* +			 * Thread stack in /proc/PID/task/TID/maps or +			 * the main process stack. +			 */ +			if (!is_pid || (vma->vm_start <= mm->start_stack && +			    vma->vm_end >= mm->start_stack)) { +				name = "[stack]";  			} else { -				name = "[vdso]"; +				/* Thread stack in /proc/PID/maps */ +				pad_len_spaces(m, len); +				seq_printf(m, "[stack:%d]", tid);  			}  		} -		if (name) { -			pad_len_spaces(m, len); -			seq_puts(m, name); -		} +	} + +done: +	if (name) { +		pad_len_spaces(m, len); +		seq_puts(m, name);  	}  	seq_putc(m, '\n');  } -static int show_map(struct seq_file *m, void *v) +static int show_map(struct seq_file *m, void *v, int is_pid)  {  	struct vm_area_struct *vma = v;  	struct proc_maps_private *priv = m->private;  	struct task_struct *task = priv->task; -	show_map_vma(m, vma); +	show_map_vma(m, vma, is_pid);  	if (m->count < m->size)  /* vma is copied successfully */  		m->version = (vma != get_gate_vma(task->mm)) @@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v)  	return 0;  } +static int show_pid_map(struct seq_file *m, void *v) +{ +	return show_map(m, v, 1); +} + +static int show_tid_map(struct seq_file *m, void *v) +{ +	return show_map(m, v, 0); +} 
+  static const struct seq_operations proc_pid_maps_op = {  	.start	= m_start,  	.next	= m_next,  	.stop	= m_stop, -	.show	= show_map +	.show	= show_pid_map  }; -static int maps_open(struct inode *inode, struct file *file) +static const struct seq_operations proc_tid_maps_op = { +	.start	= m_start, +	.next	= m_next, +	.stop	= m_stop, +	.show	= show_tid_map +}; + +static int pid_maps_open(struct inode *inode, struct file *file)  {  	return do_maps_open(inode, file, &proc_pid_maps_op);  } -const struct file_operations proc_maps_operations = { -	.open		= maps_open, +static int tid_maps_open(struct inode *inode, struct file *file) +{ +	return do_maps_open(inode, file, &proc_tid_maps_op); +} + +const struct file_operations proc_pid_maps_operations = { +	.open		= pid_maps_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { +	.open		= tid_maps_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release_private, @@ -394,21 +448,15 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  	pte_t *pte;  	spinlock_t *ptl; -	spin_lock(&walk->mm->page_table_lock); -	if (pmd_trans_huge(*pmd)) { -		if (pmd_trans_splitting(*pmd)) { -			spin_unlock(&walk->mm->page_table_lock); -			wait_split_huge_page(vma->anon_vma, pmd); -		} else { -			smaps_pte_entry(*(pte_t *)pmd, addr, -					HPAGE_PMD_SIZE, walk); -			spin_unlock(&walk->mm->page_table_lock); -			mss->anonymous_thp += HPAGE_PMD_SIZE; -			return 0; -		} -	} else { +	if (pmd_trans_huge_lock(pmd, vma) == 1) { +		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);  		spin_unlock(&walk->mm->page_table_lock); +		mss->anonymous_thp += HPAGE_PMD_SIZE; +		return 0;  	} + +	if (pmd_trans_unstable(pmd)) +		return 0;  	/*  	 * The mmap_sem held all the way back in m_start() is what  	 * keeps khugepaged out of here and from collapsing things @@ -422,7 +470,7 @@ static int smaps_pte_range(pmd_t 
*pmd, unsigned long addr, unsigned long end,  	return 0;  } -static int show_smap(struct seq_file *m, void *v) +static int show_smap(struct seq_file *m, void *v, int is_pid)  {  	struct proc_maps_private *priv = m->private;  	struct task_struct *task = priv->task; @@ -440,7 +488,7 @@ static int show_smap(struct seq_file *m, void *v)  	if (vma->vm_mm && !is_vm_hugetlb_page(vma))  		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); -	show_map_vma(m, vma); +	show_map_vma(m, vma, is_pid);  	seq_printf(m,  		   "Size:           %8lu kB\n" @@ -479,20 +527,49 @@ static int show_smap(struct seq_file *m, void *v)  	return 0;  } +static int show_pid_smap(struct seq_file *m, void *v) +{ +	return show_smap(m, v, 1); +} + +static int show_tid_smap(struct seq_file *m, void *v) +{ +	return show_smap(m, v, 0); +} +  static const struct seq_operations proc_pid_smaps_op = {  	.start	= m_start,  	.next	= m_next,  	.stop	= m_stop, -	.show	= show_smap +	.show	= show_pid_smap +}; + +static const struct seq_operations proc_tid_smaps_op = { +	.start	= m_start, +	.next	= m_next, +	.stop	= m_stop, +	.show	= show_tid_smap  }; -static int smaps_open(struct inode *inode, struct file *file) +static int pid_smaps_open(struct inode *inode, struct file *file)  {  	return do_maps_open(inode, file, &proc_pid_smaps_op);  } -const struct file_operations proc_smaps_operations = { -	.open		= smaps_open, +static int tid_smaps_open(struct inode *inode, struct file *file) +{ +	return do_maps_open(inode, file, &proc_tid_smaps_op); +} + +const struct file_operations proc_pid_smaps_operations = { +	.open		= pid_smaps_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_private, +}; + +const struct file_operations proc_tid_smaps_operations = { +	.open		= tid_smaps_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release_private, @@ -507,6 +584,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,  	struct page *page;  	
split_huge_page_pmd(walk->mm, pmd); +	if (pmd_trans_unstable(pmd)) +		return 0;  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);  	for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -518,6 +597,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,  		if (!page)  			continue; +		if (PageReserved(page)) +			continue; +  		/* Clear accessed and referenced bits. */  		ptep_test_and_clear_young(vma, addr, pte);  		ClearPageReferenced(page); @@ -595,11 +677,18 @@ const struct file_operations proc_clear_refs_operations = {  	.llseek		= noop_llseek,  }; +typedef struct { +	u64 pme; +} pagemap_entry_t; +  struct pagemapread {  	int pos, len; -	u64 *buffer; +	pagemap_entry_t *buffer;  }; +#define PAGEMAP_WALK_SIZE	(PMD_SIZE) +#define PAGEMAP_WALK_MASK	(PMD_MASK) +  #define PM_ENTRY_BYTES      sizeof(u64)  #define PM_STATUS_BITS      3  #define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS) @@ -617,10 +706,15 @@ struct pagemapread {  #define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)  #define PM_END_OF_BUFFER    1 -static int add_to_pagemap(unsigned long addr, u64 pfn, +static inline pagemap_entry_t make_pme(u64 val) +{ +	return (pagemap_entry_t) { .pme = val }; +} + +static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,  			  struct pagemapread *pm)  { -	pm->buffer[pm->pos++] = pfn; +	pm->buffer[pm->pos++] = *pme;  	if (pm->pos >= pm->len)  		return PM_END_OF_BUFFER;  	return 0; @@ -632,8 +726,10 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,  	struct pagemapread *pm = walk->private;  	unsigned long addr;  	int err = 0; +	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); +  	for (addr = start; addr < end; addr += PAGE_SIZE) { -		err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); +		err = add_to_pagemap(addr, &pme, pm);  		if (err)  			break;  	} @@ -646,18 +742,36 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)  	return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);  } -static u64 pte_to_pagemap_entry(pte_t pte) 
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte)  { -	u64 pme = 0;  	if (is_swap_pte(pte)) -		pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) -			| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; +		*pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte)) +				| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP);  	else if (pte_present(pte)) -		pme = PM_PFRAME(pte_pfn(pte)) -			| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; -	return pme; +		*pme = make_pme(PM_PFRAME(pte_pfn(pte)) +				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);  } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +					pmd_t pmd, int offset) +{ +	/* +	 * Currently pmd for thp is always present because thp can not be +	 * swapped-out, migrated, or HWPOISONed (split in such cases instead.) +	 * This if-check is just to prepare for future implementation. +	 */ +	if (pmd_present(pmd)) +		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) +				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); +} +#else +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +						pmd_t pmd, int offset) +{ +} +#endif +  static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  			     struct mm_walk *walk)  { @@ -665,13 +779,30 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  	struct pagemapread *pm = walk->private;  	pte_t *pte;  	int err = 0; +	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); -	split_huge_page_pmd(walk->mm, pmd); +	if (pmd_trans_unstable(pmd)) +		return 0;  	/* find the first VMA at or above 'addr' */  	vma = find_vma(walk->mm, addr); +	spin_lock(&walk->mm->page_table_lock); +	if (pmd_trans_huge_lock(pmd, vma) == 1) { +		for (; addr != end; addr += PAGE_SIZE) { +			unsigned long offset; + +			offset = (addr & ~PAGEMAP_WALK_MASK) >> +					PAGE_SHIFT; +			thp_pmd_to_pagemap_entry(&pme, *pmd, offset); +			err = add_to_pagemap(addr, &pme, pm); +			if (err) +				break; +		} +		spin_unlock(&walk->mm->page_table_lock); +		return err; +	
} +  	for (; addr != end; addr += PAGE_SIZE) { -		u64 pfn = PM_NOT_PRESENT;  		/* check to see if we've left 'vma' behind  		 * and need a new, higher one */ @@ -683,11 +814,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  		if (vma && (vma->vm_start <= addr) &&  		    !is_vm_hugetlb_page(vma)) {  			pte = pte_offset_map(pmd, addr); -			pfn = pte_to_pagemap_entry(*pte); +			pte_to_pagemap_entry(&pme, *pte);  			/* unmap before userspace copy */  			pte_unmap(pte);  		} -		err = add_to_pagemap(addr, pfn, pm); +		err = add_to_pagemap(addr, &pme, pm);  		if (err)  			return err;  	} @@ -698,13 +829,12 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,  }  #ifdef CONFIG_HUGETLB_PAGE -static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +					pte_t pte, int offset)  { -	u64 pme = 0;  	if (pte_present(pte)) -		pme = PM_PFRAME(pte_pfn(pte) + offset) -			| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; -	return pme; +		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) +				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);  }  /* This function walks within one hugetlb entry in the single call */ @@ -714,12 +844,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,  {  	struct pagemapread *pm = walk->private;  	int err = 0; -	u64 pfn; +	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);  	for (; addr != end; addr += PAGE_SIZE) {  		int offset = (addr & ~hmask) >> PAGE_SHIFT; -		pfn = huge_pte_to_pagemap_entry(*pte, offset); -		err = add_to_pagemap(addr, pfn, pm); +		huge_pte_to_pagemap_entry(&pme, *pte, offset); +		err = add_to_pagemap(addr, &pme, pm);  		if (err)  			return err;  	} @@ -754,8 +884,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,   * determine which areas of memory are actually mapped and llseek to   * skip over unmapped regions.   
*/ -#define PAGEMAP_WALK_SIZE	(PMD_SIZE) -#define PAGEMAP_WALK_MASK	(PMD_MASK)  static ssize_t pagemap_read(struct file *file, char __user *buf,  			    size_t count, loff_t *ppos)  { @@ -938,26 +1066,21 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,  	pte_t *pte;  	md = walk->private; -	spin_lock(&walk->mm->page_table_lock); -	if (pmd_trans_huge(*pmd)) { -		if (pmd_trans_splitting(*pmd)) { -			spin_unlock(&walk->mm->page_table_lock); -			wait_split_huge_page(md->vma->anon_vma, pmd); -		} else { -			pte_t huge_pte = *(pte_t *)pmd; -			struct page *page; -			page = can_gather_numa_stats(huge_pte, md->vma, addr); -			if (page) -				gather_stats(page, md, pte_dirty(huge_pte), -						HPAGE_PMD_SIZE/PAGE_SIZE); -			spin_unlock(&walk->mm->page_table_lock); -			return 0; -		} -	} else { +	if (pmd_trans_huge_lock(pmd, md->vma) == 1) { +		pte_t huge_pte = *(pte_t *)pmd; +		struct page *page; + +		page = can_gather_numa_stats(huge_pte, md->vma, addr); +		if (page) +			gather_stats(page, md, pte_dirty(huge_pte), +				     HPAGE_PMD_SIZE/PAGE_SIZE);  		spin_unlock(&walk->mm->page_table_lock); +		return 0;  	} +	if (pmd_trans_unstable(pmd)) +		return 0;  	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);  	do {  		struct page *page = can_gather_numa_stats(*pte, md->vma, addr); @@ -999,7 +1122,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,  /*   * Display pages allocated per node and memory policy via /proc.   
*/ -static int show_numa_map(struct seq_file *m, void *v) +static int show_numa_map(struct seq_file *m, void *v, int is_pid)  {  	struct numa_maps_private *numa_priv = m->private;  	struct proc_maps_private *proc_priv = &numa_priv->proc_maps; @@ -1036,9 +1159,19 @@ static int show_numa_map(struct seq_file *m, void *v)  		seq_path(m, &file->f_path, "\n\t= ");  	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {  		seq_printf(m, " heap"); -	} else if (vma->vm_start <= mm->start_stack && -			vma->vm_end >= mm->start_stack) { -		seq_printf(m, " stack"); +	} else { +		pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); +		if (tid != 0) { +			/* +			 * Thread stack in /proc/PID/task/TID/maps or +			 * the main process stack. +			 */ +			if (!is_pid || (vma->vm_start <= mm->start_stack && +			    vma->vm_end >= mm->start_stack)) +				seq_printf(m, " stack"); +			else +				seq_printf(m, " stack:%d", tid); +		}  	}  	if (is_vm_hugetlb_page(vma)) @@ -1081,21 +1214,39 @@ out:  	return 0;  } +static int show_pid_numa_map(struct seq_file *m, void *v) +{ +	return show_numa_map(m, v, 1); +} + +static int show_tid_numa_map(struct seq_file *m, void *v) +{ +	return show_numa_map(m, v, 0); +} +  static const struct seq_operations proc_pid_numa_maps_op = { -        .start  = m_start, -        .next   = m_next, -        .stop   = m_stop, -        .show   = show_numa_map, +	.start  = m_start, +	.next   = m_next, +	.stop   = m_stop, +	.show   = show_pid_numa_map, +}; + +static const struct seq_operations proc_tid_numa_maps_op = { +	.start  = m_start, +	.next   = m_next, +	.stop   = m_stop, +	.show   = show_tid_numa_map,  }; -static int numa_maps_open(struct inode *inode, struct file *file) +static int numa_maps_open(struct inode *inode, struct file *file, +			  const struct seq_operations *ops)  {  	struct numa_maps_private *priv;  	int ret = -ENOMEM;  	priv = kzalloc(sizeof(*priv), GFP_KERNEL);  	if (priv) {  		priv->proc_maps.pid = proc_pid(inode); -		ret = 
seq_open(file, &proc_pid_numa_maps_op); +		ret = seq_open(file, ops);  		if (!ret) {  			struct seq_file *m = file->private_data;  			m->private = priv; @@ -1106,8 +1257,25 @@ static int numa_maps_open(struct inode *inode, struct file *file)  	return ret;  } -const struct file_operations proc_numa_maps_operations = { -	.open		= numa_maps_open, +static int pid_numa_maps_open(struct inode *inode, struct file *file) +{ +	return numa_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +static int tid_numa_maps_open(struct inode *inode, struct file *file) +{ +	return numa_maps_open(inode, file, &proc_tid_numa_maps_op); +} + +const struct file_operations proc_pid_numa_maps_operations = { +	.open		= pid_numa_maps_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_private, +}; + +const struct file_operations proc_tid_numa_maps_operations = { +	.open		= tid_numa_maps_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release_private, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 980de547c07..74fe164d1b2 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len)  /*   * display a single VMA to a sequenced file   */ -static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, +			  int is_pid)  {  	struct mm_struct *mm = vma->vm_mm; +	struct proc_maps_private *priv = m->private;  	unsigned long ino = 0;  	struct file *file;  	dev_t dev = 0; @@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)  		pad_len_spaces(m, len);  		seq_path(m, &file->f_path, "");  	} else if (mm) { -		if (vma->vm_start <= mm->start_stack && -			vma->vm_end >= mm->start_stack) { +		pid_t tid = vm_is_stack(priv->task, vma, is_pid); + +		if (tid != 0) {  			pad_len_spaces(m, len); -			seq_puts(m, "[stack]"); +			/* +			 * Thread stack in 
/proc/PID/task/TID/maps or +			 * the main process stack. +			 */ +			if (!is_pid || (vma->vm_start <= mm->start_stack && +			    vma->vm_end >= mm->start_stack)) +				seq_printf(m, "[stack]"); +			else +				seq_printf(m, "[stack:%d]", tid);  		}  	} @@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)  /*   * display mapping lines for a particular process's /proc/pid/maps   */ -static int show_map(struct seq_file *m, void *_p) +static int show_map(struct seq_file *m, void *_p, int is_pid)  {  	struct rb_node *p = _p; -	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); +	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), +			      is_pid); +} + +static int show_pid_map(struct seq_file *m, void *_p) +{ +	return show_map(m, _p, 1); +} + +static int show_tid_map(struct seq_file *m, void *_p) +{ +	return show_map(m, _p, 0);  }  static void *m_start(struct seq_file *m, loff_t *pos) @@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = {  	.start	= m_start,  	.next	= m_next,  	.stop	= m_stop, -	.show	= show_map +	.show	= show_pid_map +}; + +static const struct seq_operations proc_tid_maps_ops = { +	.start	= m_start, +	.next	= m_next, +	.stop	= m_stop, +	.show	= show_tid_map  }; -static int maps_open(struct inode *inode, struct file *file) +static int maps_open(struct inode *inode, struct file *file, +		     const struct seq_operations *ops)  {  	struct proc_maps_private *priv;  	int ret = -ENOMEM; @@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file)  	priv = kzalloc(sizeof(*priv), GFP_KERNEL);  	if (priv) {  		priv->pid = proc_pid(inode); -		ret = seq_open(file, &proc_pid_maps_ops); +		ret = seq_open(file, ops);  		if (!ret) {  			struct seq_file *m = file->private_data;  			m->private = priv; @@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file)  	return ret;  } -const struct file_operations proc_maps_operations = 
{ -	.open		= maps_open, +static int pid_maps_open(struct inode *inode, struct file *file) +{ +	return maps_open(inode, file, &proc_pid_maps_ops); +} + +static int tid_maps_open(struct inode *inode, struct file *file) +{ +	return maps_open(inode, file, &proc_tid_maps_ops); +} + +const struct file_operations proc_pid_maps_operations = { +	.open		= pid_maps_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { +	.open		= tid_maps_open,  	.read		= seq_read,  	.llseek		= seq_lseek,  	.release	= seq_release_private, diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b0f450a2bb7..0d5071d2998 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -700,3 +700,26 @@ static int __init vmcore_init(void)  	return 0;  }  module_init(vmcore_init) + +/* Cleanup function for vmcore module. */ +void vmcore_cleanup(void) +{ +	struct list_head *pos, *next; + +	if (proc_vmcore) { +		remove_proc_entry(proc_vmcore->name, proc_vmcore->parent); +		proc_vmcore = NULL; +	} + +	/* clear the vmcore list. 
*/ +	list_for_each_safe(pos, next, &vmcore_list) { +		struct vmcore *m; + +		m = list_entry(pos, struct vmcore, list); +		list_del(&m->list); +		kfree(m); +	} +	kfree(elfcorebuf); +	elfcorebuf = NULL; +} +EXPORT_SYMBOL_GPL(vmcore_cleanup); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index b3b426edb2f..f37c32b9452 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -278,9 +278,7 @@ fail:  int pstore_fill_super(struct super_block *sb, void *data, int silent)  { -	struct inode *inode = NULL; -	struct dentry *root; -	int err; +	struct inode *inode;  	save_mount_options(sb, data); @@ -296,26 +294,17 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent)  	parse_options(data);  	inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0); -	if (!inode) { -		err = -ENOMEM; -		goto fail; -	} -	/* override ramfs "dir" options so we catch unlink(2) */ -	inode->i_op = &pstore_dir_inode_operations; - -	root = d_alloc_root(inode); -	sb->s_root = root; -	if (!root) { -		err = -ENOMEM; -		goto fail; +	if (inode) { +		/* override ramfs "dir" options so we catch unlink(2) */ +		inode->i_op = &pstore_dir_inode_operations;  	} +	sb->s_root = d_make_root(inode); +	if (!sb->s_root) +		return -ENOMEM;  	pstore_get_records(0);  	return 0; -fail: -	iput(inode); -	return err;  }  static struct dentry *pstore_mount(struct file_system_type *fs_type, diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 9ec22d3b429..82c585f715e 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -68,9 +68,25 @@ void pstore_set_kmsg_bytes(int bytes)  /* Tag each group of saved records with a sequence number */  static int	oopscount; -static char *reason_str[] = { -	"Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency" -}; +static const char *get_reason_str(enum kmsg_dump_reason reason) +{ +	switch (reason) { +	case KMSG_DUMP_PANIC: +		return "Panic"; +	case KMSG_DUMP_OOPS: +		return "Oops"; +	case KMSG_DUMP_EMERG: +		return "Emergency"; +	case 
KMSG_DUMP_RESTART: +		return "Restart"; +	case KMSG_DUMP_HALT: +		return "Halt"; +	case KMSG_DUMP_POWEROFF: +		return "Poweroff"; +	default: +		return "Unknown"; +	} +}  /*   * callback from kmsg_dump. (s2,l2) has the most recently @@ -85,17 +101,15 @@ static void pstore_dump(struct kmsg_dumper *dumper,  	unsigned long	s1_start, s2_start;  	unsigned long	l1_cpy, l2_cpy;  	unsigned long	size, total = 0; -	char		*dst, *why; +	char		*dst; +	const char	*why;  	u64		id;  	int		hsize, ret;  	unsigned int	part = 1;  	unsigned long	flags = 0;  	int		is_locked = 0; -	if (reason < ARRAY_SIZE(reason_str)) -		why = reason_str[reason]; -	else -		why = "Unknown"; +	why = get_reason_str(reason);  	if (in_nmi()) {  		is_locked = spin_trylock(&psinfo->buf_lock); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 6b009548d2e..552e994e3aa 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -52,38 +52,6 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)  	return 0;  } -static struct buffer_head *qnx4_getblk(struct inode *inode, int nr, -				       int create) -{ -	struct buffer_head *result = NULL; - -	if ( nr >= 0 ) -		nr = qnx4_block_map( inode, nr ); -	if (nr) { -		result = sb_getblk(inode->i_sb, nr); -		return result; -	} -	return NULL; -} - -struct buffer_head *qnx4_bread(struct inode *inode, int block, int create) -{ -	struct buffer_head *bh; - -	bh = qnx4_getblk(inode, block, create); -	if (!bh || buffer_uptodate(bh)) { -		return bh; -	} -	ll_rw_block(READ, 1, &bh); -	wait_on_buffer(bh); -	if (buffer_uptodate(bh)) { -		return bh; -	} -	brelse(bh); - -	return NULL; -} -  static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create )  {  	unsigned long phys; @@ -98,23 +66,31 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h  	return 0;  } +static inline u32 try_extent(qnx4_xtnt_t *extent, u32 *offset) +{ +	u32 size = le32_to_cpu(extent->xtnt_size); +	if (*offset < size) +		return 
le32_to_cpu(extent->xtnt_blk) + *offset - 1; +	*offset -= size; +	return 0; +} +  unsigned long qnx4_block_map( struct inode *inode, long iblock )  {  	int ix; -	long offset, i_xblk; -	unsigned long block = 0; +	long i_xblk;  	struct buffer_head *bh = NULL;  	struct qnx4_xblk *xblk = NULL;  	struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode);  	u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts); +	u32 offset = iblock; +	u32 block = try_extent(&qnx4_inode->di_first_xtnt, &offset); -	if ( iblock < le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size) ) { +	if (block) {  		// iblock is in the first extent. This is easy. -		block = le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_blk) + iblock - 1;  	} else {  		// iblock is beyond first extent. We have to follow the extent chain.  		i_xblk = le32_to_cpu(qnx4_inode->di_xblk); -		offset = iblock - le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size);  		ix = 0;  		while ( --nxtnt > 0 ) {  			if ( ix == 0 ) { @@ -130,12 +106,11 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )  					return -EIO;  				}  			} -			if ( offset < le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size) ) { +			block = try_extent(&xblk->xblk_xtnts[ix], &offset); +			if (block) {  				// got it! 
-				block = le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_blk) + offset - 1;  				break;  			} -			offset -= le32_to_cpu(xblk->xblk_xtnts[ix].xtnt_size);  			if ( ++ix >= xblk->xblk_num_xtnts ) {  				i_xblk = le32_to_cpu(xblk->xblk_next_xblk);  				ix = 0; @@ -260,15 +235,13 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)   	}  	ret = -ENOMEM; - 	s->s_root = d_alloc_root(root); + 	s->s_root = d_make_root(root);   	if (s->s_root == NULL) - 		goto outi; + 		goto outb;  	brelse(bh);  	return 0; -      outi: -	iput(root);        outb:  	kfree(qs->BitMap);        out: @@ -288,44 +261,17 @@ static void qnx4_put_super(struct super_block *sb)  	return;  } -static int qnx4_writepage(struct page *page, struct writeback_control *wbc) -{ -	return block_write_full_page(page,qnx4_get_block, wbc); -} -  static int qnx4_readpage(struct file *file, struct page *page)  {  	return block_read_full_page(page,qnx4_get_block);  } -static int qnx4_write_begin(struct file *file, struct address_space *mapping, -			loff_t pos, unsigned len, unsigned flags, -			struct page **pagep, void **fsdata) -{ -	struct qnx4_inode_info *qnx4_inode = qnx4_i(mapping->host); -	int ret; - -	*pagep = NULL; -	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, -				qnx4_get_block, -				&qnx4_inode->mmu_private); -	if (unlikely(ret)) { -		loff_t isize = mapping->host->i_size; -		if (pos + len > isize) -			vmtruncate(mapping->host, isize); -	} - -	return ret; -}  static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)  {  	return generic_block_bmap(mapping,block,qnx4_get_block);  }  static const struct address_space_operations qnx4_aops = {  	.readpage	= qnx4_readpage, -	.writepage	= qnx4_writepage, -	.write_begin	= qnx4_write_begin, -	.write_end	= generic_write_end,  	.bmap		= qnx4_bmap  }; diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 275327b5615..a512c0b30e8 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -39,10 +39,6 @@ static int 
qnx4_match(int len, const char *name,  	} else {  		namelen = QNX4_SHORT_NAME_MAX;  	} -	/* "" means "." ---> so paths like "/usr/lib//libc.a" work */ -	if (!len && (de->di_fname[0] == '.') && (de->di_fname[1] == '\0')) { -		return 1; -	}  	thislen = strlen( de->di_fname );  	if ( thislen > namelen )  		thislen = namelen; @@ -72,7 +68,9 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,  	block = offset = blkofs = 0;  	while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) {  		if (!bh) { -			bh = qnx4_bread(dir, blkofs, 0); +			block = qnx4_block_map(dir, blkofs); +			if (block) +				bh = sb_bread(dir->i_sb, block);  			if (!bh) {  				blkofs++;  				continue; @@ -80,7 +78,6 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,  		}  		*res_dir = (struct qnx4_inode_entry *) (bh->b_data + offset);  		if (qnx4_match(len, name, bh, &offset)) { -			block = qnx4_block_map( dir, blkofs );  			*ino = block * QNX4_INODES_PER_BLOCK +  			    (offset / QNX4_DIR_ENTRY_SIZE) - 1;  			return bh; diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 33a60858203..244d4620189 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -27,8 +27,6 @@ extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, stru  extern unsigned long qnx4_count_free_blocks(struct super_block *sb);  extern unsigned long qnx4_block_map(struct inode *inode, long iblock); -extern struct buffer_head *qnx4_bread(struct inode *, int, int); -  extern const struct inode_operations qnx4_dir_inode_operations;  extern const struct file_operations qnx4_dir_operations;  extern int qnx4_is_free(struct super_block *sb, long block); diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig new file mode 100644 index 00000000000..edbba5c17cc --- /dev/null +++ b/fs/qnx6/Kconfig @@ -0,0 +1,26 @@ +config QNX6FS_FS +	tristate "QNX6 file system support (read only)" +	depends on BLOCK && CRC32 +	help +	  This is the file system used by the real-time operating systems +	  QNX 6 
(also called QNX RTP). +	  Further information is available at <http://www.qnx.com/>. +	  Say Y if you intend to mount QNX hard disks or floppies formatted +          with a mkqnx6fs. +	  However, keep in mind that this currently is a readonly driver! + +	  To compile this file system support as a module, choose M here: the +	  module will be called qnx6. + +	  If you don't know whether you need it, then you don't need it: +	  answer N. + +config QNX6FS_DEBUG +	bool "QNX6 debugging information" +	depends on QNX6FS_FS +	help +	  Turns on extended debugging output. + +	  If you are not a developer working on the QNX6FS, you probably don't +	  want this: +	  answer N. diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile new file mode 100644 index 00000000000..9dd06199afc --- /dev/null +++ b/fs/qnx6/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux qnx4-filesystem routines. +# + +obj-$(CONFIG_QNX6FS_FS) += qnx6.o + +qnx6-objs := inode.o dir.o namei.o super_mmi.o diff --git a/fs/qnx6/README b/fs/qnx6/README new file mode 100644 index 00000000000..116d622026c --- /dev/null +++ b/fs/qnx6/README @@ -0,0 +1,8 @@ + +  This is a snapshot of the QNX6 filesystem for Linux. +  Please send diffs and remarks to <chaosman@ontika.net> . + +Credits : + +Al Viro		<viro@ZenIV.linux.org.uk> (endless patience with me & support ;)) +Kai Bankett	<chaosman@ontika.net> (Maintainer) diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c new file mode 100644 index 00000000000..dc597353db3 --- /dev/null +++ b/fs/qnx6/dir.c @@ -0,0 +1,291 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. + * 16-02-2012 pagemap extension by Al Viro + * + */ + +#include "qnx6.h" + +static unsigned qnx6_lfile_checksum(char *name, unsigned size) +{ +	unsigned crc = 0; +	char *end = name + size; +	while (name < end) { +		crc = ((crc >> 1) + *(name++)) ^ +			((crc & 0x00000001) ? 
0x80000000 : 0); +	} +	return crc; +} + +static struct page *qnx6_get_page(struct inode *dir, unsigned long n) +{ +	struct address_space *mapping = dir->i_mapping; +	struct page *page = read_mapping_page(mapping, n, NULL); +	if (!IS_ERR(page)) +		kmap(page); +	return page; +} + +static inline unsigned long dir_pages(struct inode *inode) +{ +	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +static unsigned last_entry(struct inode *inode, unsigned long page_nr) +{ +	unsigned long last_byte = inode->i_size; +	last_byte -= page_nr << PAGE_CACHE_SHIFT; +	if (last_byte > PAGE_CACHE_SIZE) +		last_byte = PAGE_CACHE_SIZE; +	return last_byte / QNX6_DIR_ENTRY_SIZE; +} + +static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, +					 struct qnx6_long_dir_entry *de, +					 struct page **p) +{ +	struct qnx6_sb_info *sbi = QNX6_SB(sb); +	u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */ +	u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */ +	/* within page */ +	u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK; +	struct address_space *mapping = sbi->longfile->i_mapping; +	struct page *page = read_mapping_page(mapping, n, NULL); +	if (IS_ERR(page)) +		return ERR_CAST(page); +	kmap(*p = page); +	return (struct qnx6_long_filename *)(page_address(page) + offs); +} + +static int qnx6_dir_longfilename(struct inode *inode, +			struct qnx6_long_dir_entry *de, +			void *dirent, loff_t pos, +			unsigned de_inode, filldir_t filldir) +{ +	struct qnx6_long_filename *lf; +	struct super_block *s = inode->i_sb; +	struct qnx6_sb_info *sbi = QNX6_SB(s); +	struct page *page; +	int lf_size; + +	if (de->de_size != 0xff) { +		/* error - long filename entries always have size 0xff +		   in direntry */ +		printk(KERN_ERR "qnx6: invalid direntry size (%i).\n", +				de->de_size); +		return 0; +	} +	lf = qnx6_longname(s, de, &page); +	if (IS_ERR(lf)) { +		printk(KERN_ERR "qnx6:Error reading longname\n"); +		return 0; +	} + +	
lf_size = fs16_to_cpu(sbi, lf->lf_size); + +	if (lf_size > QNX6_LONG_NAME_MAX) { +		QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname)); +		printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size); +		qnx6_put_page(page); +		return 0; +	} + +	/* calc & validate longfilename checksum +	   mmi 3g filesystem does not have that checksum */ +	if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) != +			qnx6_lfile_checksum(lf->lf_fname, lf_size)) +		printk(KERN_INFO "qnx6: long filename checksum error.\n"); + +	QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", +					lf_size, lf->lf_fname, de_inode)); +	if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode, +			DT_UNKNOWN) < 0) { +		qnx6_put_page(page); +		return 0; +	} + +	qnx6_put_page(page); +	/* success */ +	return 1; +} + +static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ +	struct inode *inode = filp->f_path.dentry->d_inode; +	struct super_block *s = inode->i_sb; +	struct qnx6_sb_info *sbi = QNX6_SB(s); +	loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); +	unsigned long npages = dir_pages(inode); +	unsigned long n = pos >> PAGE_CACHE_SHIFT; +	unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; +	bool done = false; + +	if (filp->f_pos >= inode->i_size) +		return 0; + +	for ( ; !done && n < npages; n++, start = 0) { +		struct page *page = qnx6_get_page(inode, n); +		int limit = last_entry(inode, n); +		struct qnx6_dir_entry *de; +		int i = start; + +		if (IS_ERR(page)) { +			printk(KERN_ERR "qnx6_readdir: read failed\n"); +			filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT; +			return PTR_ERR(page); +		} +		de = ((struct qnx6_dir_entry *)page_address(page)) + start; +		for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) { +			int size = de->de_size; +			u32 no_inode = fs32_to_cpu(sbi, de->de_inode); + +			if (!no_inode || !size) +				continue; + +			if (size > QNX6_SHORT_NAME_MAX) { +				/* long filename detected +				   get the filename from long filename +				  
 structure / block */ +				if (!qnx6_dir_longfilename(inode, +					(struct qnx6_long_dir_entry *)de, +					dirent, pos, no_inode, +					filldir)) { +					done = true; +					break; +				} +			} else { +				QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" +				   " inode:%u\n", size, de->de_fname, +							no_inode)); +				if (filldir(dirent, de->de_fname, size, +				      pos, no_inode, DT_UNKNOWN) +					< 0) { +					done = true; +					break; +				} +			} +		} +		qnx6_put_page(page); +	} +	filp->f_pos = pos; +	return 0; +} + +/* + * check if the long filename is correct. + */ +static unsigned qnx6_long_match(int len, const char *name, +			struct qnx6_long_dir_entry *de, struct inode *dir) +{ +	struct super_block *s = dir->i_sb; +	struct qnx6_sb_info *sbi = QNX6_SB(s); +	struct page *page; +	int thislen; +	struct qnx6_long_filename *lf = qnx6_longname(s, de, &page); + +	if (IS_ERR(lf)) +		return 0; + +	thislen = fs16_to_cpu(sbi, lf->lf_size); +	if (len != thislen) { +		qnx6_put_page(page); +		return 0; +	} +	if (memcmp(name, lf->lf_fname, len) == 0) { +		qnx6_put_page(page); +		return fs32_to_cpu(sbi, de->de_inode); +	} +	qnx6_put_page(page); +	return 0; +} + +/* + * check if the filename is correct. 
+ */ +static unsigned qnx6_match(struct super_block *s, int len, const char *name, +			struct qnx6_dir_entry *de) +{ +	struct qnx6_sb_info *sbi = QNX6_SB(s); +	if (memcmp(name, de->de_fname, len) == 0) +		return fs32_to_cpu(sbi, de->de_inode); +	return 0; +} + + +unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, +			 struct page **res_page) +{ +	struct super_block *s = dir->i_sb; +	struct qnx6_inode_info *ei = QNX6_I(dir); +	struct page *page = NULL; +	unsigned long start, n; +	unsigned long npages = dir_pages(dir); +	unsigned ino; +	struct qnx6_dir_entry *de; +	struct qnx6_long_dir_entry *lde; + +	*res_page = NULL; + +	if (npages == 0) +		return 0; +	start = ei->i_dir_start_lookup; +	if (start >= npages) +		start = 0; +	n = start; + +	do { +		page = qnx6_get_page(dir, n); +		if (!IS_ERR(page)) { +			int limit = last_entry(dir, n); +			int i; + +			de = (struct qnx6_dir_entry *)page_address(page); +			for (i = 0; i < limit; i++, de++) { +				if (len <= QNX6_SHORT_NAME_MAX) { +					/* short filename */ +					if (len != de->de_size) +						continue; +					ino = qnx6_match(s, len, name, de); +					if (ino) +						goto found; +				} else if (de->de_size == 0xff) { +					/* deal with long filename */ +					lde = (struct qnx6_long_dir_entry *)de; +					ino = qnx6_long_match(len, +								name, lde, dir); +					if (ino) +						goto found; +				} else +					printk(KERN_ERR "qnx6: undefined " +						"filename size in inode.\n"); +			} +			qnx6_put_page(page); +		} + +		if (++n >= npages) +			n = 0; +	} while (n != start); +	return 0; + +found: +	*res_page = page; +	ei->i_dir_start_lookup = n; +	return ino; +} + +const struct file_operations qnx6_dir_operations = { +	.llseek		= generic_file_llseek, +	.read		= generic_read_dir, +	.readdir	= qnx6_readdir, +	.fsync		= generic_file_fsync, +}; + +const struct inode_operations qnx6_dir_inode_operations = { +	.lookup		= qnx6_lookup, +}; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c new file mode 100644 index 
00000000000..e44012dc564 --- /dev/null +++ b/fs/qnx6/inode.c @@ -0,0 +1,698 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. + * 16-02-2012 pagemap extension by Al Viro + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/parser.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/crc32.h> +#include <linux/mpage.h> +#include "qnx6.h" + +static const struct super_operations qnx6_sops; + +static void qnx6_put_super(struct super_block *sb); +static struct inode *qnx6_alloc_inode(struct super_block *sb); +static void qnx6_destroy_inode(struct inode *inode); +static int qnx6_remount(struct super_block *sb, int *flags, char *data); +static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf); +static int qnx6_show_options(struct seq_file *seq, struct dentry *root); + +static const struct super_operations qnx6_sops = { +	.alloc_inode	= qnx6_alloc_inode, +	.destroy_inode	= qnx6_destroy_inode, +	.put_super	= qnx6_put_super, +	.statfs		= qnx6_statfs, +	.remount_fs	= qnx6_remount, +	.show_options	= qnx6_show_options, +}; + +static int qnx6_show_options(struct seq_file *seq, struct dentry *root) +{ +	struct super_block *sb = root->d_sb; +	struct qnx6_sb_info *sbi = QNX6_SB(sb); + +	if (sbi->s_mount_opt & QNX6_MOUNT_MMI_FS) +		seq_puts(seq, ",mmi_fs"); +	return 0; +} + +static int qnx6_remount(struct super_block *sb, int *flags, char *data) +{ +	*flags |= MS_RDONLY; +	return 0; +} + +static unsigned qnx6_get_devblock(struct super_block *sb, __fs32 block) +{ +	struct qnx6_sb_info *sbi = QNX6_SB(sb); +	return fs32_to_cpu(sbi, block) + sbi->s_blks_off; +} + +static unsigned qnx6_block_map(struct inode *inode, unsigned iblock); + +static int 
qnx6_get_block(struct inode *inode, sector_t iblock, +			struct buffer_head *bh, int create) +{ +	unsigned phys; + +	QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n", +			inode->i_ino, (unsigned long)iblock)); + +	phys = qnx6_block_map(inode, iblock); +	if (phys) { +		/* logical block is before EOF */ +		map_bh(bh, inode->i_sb, phys); +	} +	return 0; +} + +static int qnx6_check_blockptr(__fs32 ptr) +{ +	if (ptr == ~(__fs32)0) { +		printk(KERN_ERR "qnx6: hit unused blockpointer.\n"); +		return 0; +	} +	return 1; +} + +static int qnx6_readpage(struct file *file, struct page *page) +{ +	return mpage_readpage(page, qnx6_get_block); +} + +static int qnx6_readpages(struct file *file, struct address_space *mapping, +		   struct list_head *pages, unsigned nr_pages) +{ +	return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); +} + +/* + * returns the block number for the no-th element in the tree + * inodebits requred as there are multiple inodes in one inode block + */ +static unsigned qnx6_block_map(struct inode *inode, unsigned no) +{ +	struct super_block *s = inode->i_sb; +	struct qnx6_sb_info *sbi = QNX6_SB(s); +	struct qnx6_inode_info *ei = QNX6_I(inode); +	unsigned block = 0; +	struct buffer_head *bh; +	__fs32 ptr; +	int levelptr; +	int ptrbits = sbi->s_ptrbits; +	int bitdelta; +	u32 mask = (1 << ptrbits) - 1; +	int depth = ei->di_filelevels; +	int i; + +	bitdelta = ptrbits * depth; +	levelptr = no >> bitdelta; + +	if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { +		printk(KERN_ERR "qnx6:Requested file block number (%u) too big.", +				no); +		return 0; +	} + +	block = qnx6_get_devblock(s, ei->di_block_ptr[levelptr]); + +	for (i = 0; i < depth; i++) { +		bh = sb_bread(s, block); +		if (!bh) { +			printk(KERN_ERR "qnx6:Error reading block (%u)\n", +					block); +			return 0; +		} +		bitdelta -= ptrbits; +		levelptr = (no >> bitdelta) & mask; +		ptr = ((__fs32 *)bh->b_data)[levelptr]; + +		if (!qnx6_check_blockptr(ptr)) +			return 0; + +		
block = qnx6_get_devblock(s, ptr); +		brelse(bh); +	} +	return block; +} + +static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) +{ +	struct super_block *sb = dentry->d_sb; +	struct qnx6_sb_info *sbi = QNX6_SB(sb); +	u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + +	buf->f_type    = sb->s_magic; +	buf->f_bsize   = sb->s_blocksize; +	buf->f_blocks  = fs32_to_cpu(sbi, sbi->sb->sb_num_blocks); +	buf->f_bfree   = fs32_to_cpu(sbi, sbi->sb->sb_free_blocks); +	buf->f_files   = fs32_to_cpu(sbi, sbi->sb->sb_num_inodes); +	buf->f_ffree   = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes); +	buf->f_bavail  = buf->f_bfree; +	buf->f_namelen = QNX6_LONG_NAME_MAX; +	buf->f_fsid.val[0] = (u32)id; +	buf->f_fsid.val[1] = (u32)(id >> 32); + +	return 0; +} + +/* + * Check the root directory of the filesystem to make sure + * it really _is_ a qnx6 filesystem, and to check the size + * of the directory entry. + */ +static const char *qnx6_checkroot(struct super_block *s) +{ +	static char match_root[2][3] = {".\0\0", "..\0"}; +	int i, error = 0; +	struct qnx6_dir_entry *dir_entry; +	struct inode *root = s->s_root->d_inode; +	struct address_space *mapping = root->i_mapping; +	struct page *page = read_mapping_page(mapping, 0, NULL); +	if (IS_ERR(page)) +		return "error reading root directory"; +	kmap(page); +	dir_entry = page_address(page); +	for (i = 0; i < 2; i++) { +		/* maximum 3 bytes - due to match_root limitation */ +		if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) +			error = 1; +	} +	qnx6_put_page(page); +	if (error) +		return "error reading root directory."; +	return NULL; +} + +#ifdef CONFIG_QNX6FS_DEBUG +void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) +{ +	struct qnx6_sb_info *sbi = QNX6_SB(s); + +	QNX6DEBUG((KERN_INFO "magic: %08x\n", +				fs32_to_cpu(sbi, sb->sb_magic))); +	QNX6DEBUG((KERN_INFO "checksum: %08x\n", +				fs32_to_cpu(sbi, sb->sb_checksum))); +	QNX6DEBUG((KERN_INFO "serial: %llx\n", +				fs64_to_cpu(sbi, 
sb->sb_serial))); +	QNX6DEBUG((KERN_INFO "flags: %08x\n", +				fs32_to_cpu(sbi, sb->sb_flags))); +	QNX6DEBUG((KERN_INFO "blocksize: %08x\n", +				fs32_to_cpu(sbi, sb->sb_blocksize))); +	QNX6DEBUG((KERN_INFO "num_inodes: %08x\n", +				fs32_to_cpu(sbi, sb->sb_num_inodes))); +	QNX6DEBUG((KERN_INFO "free_inodes: %08x\n", +				fs32_to_cpu(sbi, sb->sb_free_inodes))); +	QNX6DEBUG((KERN_INFO "num_blocks: %08x\n", +				fs32_to_cpu(sbi, sb->sb_num_blocks))); +	QNX6DEBUG((KERN_INFO "free_blocks: %08x\n", +				fs32_to_cpu(sbi, sb->sb_free_blocks))); +	QNX6DEBUG((KERN_INFO "inode_levels: %02x\n", +				sb->Inode.levels)); +} +#endif + +enum { +	Opt_mmifs, +	Opt_err +}; + +static const match_table_t tokens = { +	{Opt_mmifs, "mmi_fs"}, +	{Opt_err, NULL} +}; + +static int qnx6_parse_options(char *options, struct super_block *sb) +{ +	char *p; +	struct qnx6_sb_info *sbi = QNX6_SB(sb); +	substring_t args[MAX_OPT_ARGS]; + +	if (!options) +		return 1; + +	while ((p = strsep(&options, ",")) != NULL) { +		int token; +		if (!*p) +			continue; + +		token = match_token(p, tokens, args); +		switch (token) { +		case Opt_mmifs: +			set_opt(sbi->s_mount_opt, MMI_FS); +			break; +		default: +			return 0; +		} +	} +	return 1; +} + +static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, +				int offset, int silent) +{ +	struct qnx6_sb_info *sbi = QNX6_SB(s); +	struct buffer_head *bh; +	struct qnx6_super_block *sb; + +	/* Check the superblock signatures +	   start with the first superblock */ +	bh = sb_bread(s, offset); +	if (!bh) { +		printk(KERN_ERR "qnx6: unable to read the first superblock\n"); +		return NULL; +	} +	sb = (struct qnx6_super_block *)bh->b_data; +	if (fs32_to_cpu(sbi, sb->sb_magic) != QNX6_SUPER_MAGIC) { +		sbi->s_bytesex = BYTESEX_BE; +		if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { +			/* we got a big endian fs */ +			QNX6DEBUG((KERN_INFO "qnx6: fs got different" +					" endianess.\n")); +			return bh; +		} else +			sbi->s_bytesex = BYTESEX_LE; 
+		if (!silent) { +			if (offset == 0) { +				printk(KERN_ERR "qnx6: wrong signature (magic)" +					" in superblock #1.\n"); +			} else { +				printk(KERN_INFO "qnx6: wrong signature (magic)" +					" at position (0x%lx) - will try" +					" alternative position (0x0000).\n", +						offset * s->s_blocksize); +			} +		} +		brelse(bh); +		return NULL; +	} +	return bh; +} + +static struct inode *qnx6_private_inode(struct super_block *s, +					struct qnx6_root_node *p); + +static int qnx6_fill_super(struct super_block *s, void *data, int silent) +{ +	struct buffer_head *bh1 = NULL, *bh2 = NULL; +	struct qnx6_super_block *sb1 = NULL, *sb2 = NULL; +	struct qnx6_sb_info *sbi; +	struct inode *root; +	const char *errmsg; +	struct qnx6_sb_info *qs; +	int ret = -EINVAL; +	u64 offset; +	int bootblock_offset = QNX6_BOOTBLOCK_SIZE; + +	qs = kzalloc(sizeof(struct qnx6_sb_info), GFP_KERNEL); +	if (!qs) +		return -ENOMEM; +	s->s_fs_info = qs; + +	/* Superblock always is 512 Byte long */ +	if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { +		printk(KERN_ERR "qnx6: unable to set blocksize\n"); +		goto outnobh; +	} + +	/* parse the mount-options */ +	if (!qnx6_parse_options((char *) data, s)) { +		printk(KERN_ERR "qnx6: invalid mount options.\n"); +		goto outnobh; +	} +	if (test_opt(s, MMI_FS)) { +		sb1 = qnx6_mmi_fill_super(s, silent); +		if (sb1) +			goto mmi_success; +		else +			goto outnobh; +	} +	sbi = QNX6_SB(s); +	sbi->s_bytesex = BYTESEX_LE; +	/* Check the superblock signatures +	   start with the first superblock */ +	bh1 = qnx6_check_first_superblock(s, +		bootblock_offset / QNX6_SUPERBLOCK_SIZE, silent); +	if (!bh1) { +		/* try again without bootblock offset */ +		bh1 = qnx6_check_first_superblock(s, 0, silent); +		if (!bh1) { +			printk(KERN_ERR "qnx6: unable to read the first superblock\n"); +			goto outnobh; +		} +		/* seems that no bootblock at partition start */ +		bootblock_offset = 0; +	} +	sb1 = (struct qnx6_super_block *)bh1->b_data; + +#ifdef CONFIG_QNX6FS_DEBUG +	
qnx6_superblock_debug(sb1, s); +#endif + +	/* checksum check - start at byte 8 and end at byte 512 */ +	if (fs32_to_cpu(sbi, sb1->sb_checksum) != +			crc32_be(0, (char *)(bh1->b_data + 8), 504)) { +		printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); +		goto out; +	} + +	/* set new blocksize */ +	if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { +		printk(KERN_ERR "qnx6: unable to set blocksize\n"); +		goto out; +	} +	/* blocksize invalidates bh - pull it back in */ +	brelse(bh1); +	bh1 = sb_bread(s, bootblock_offset >> s->s_blocksize_bits); +	if (!bh1) +		goto outnobh; +	sb1 = (struct qnx6_super_block *)bh1->b_data; + +	/* calculate second superblock blocknumber */ +	offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + +		(bootblock_offset >> s->s_blocksize_bits) + +		(QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); + +	/* set bootblock offset */ +	sbi->s_blks_off = (bootblock_offset >> s->s_blocksize_bits) + +			  (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); + +	/* next the second superblock */ +	bh2 = sb_bread(s, offset); +	if (!bh2) { +		printk(KERN_ERR "qnx6: unable to read the second superblock\n"); +		goto out; +	} +	sb2 = (struct qnx6_super_block *)bh2->b_data; +	if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { +		if (!silent) +			printk(KERN_ERR "qnx6: wrong signature (magic)" +					" in superblock #2.\n"); +		goto out; +	} + +	/* checksum check - start at byte 8 and end at byte 512 */ +	if (fs32_to_cpu(sbi, sb2->sb_checksum) != +				crc32_be(0, (char *)(bh2->b_data + 8), 504)) { +		printk(KERN_ERR "qnx6: superblock #2 checksum error\n"); +		goto out; +	} + +	if (fs64_to_cpu(sbi, sb1->sb_serial) >= +					fs64_to_cpu(sbi, sb2->sb_serial)) { +		/* superblock #1 active */ +		sbi->sb_buf = bh1; +		sbi->sb = (struct qnx6_super_block *)bh1->b_data; +		brelse(bh2); +		printk(KERN_INFO "qnx6: superblock #1 active\n"); +	} else { +		/* superblock #2 active */ +		sbi->sb_buf = bh2; +		sbi->sb = (struct qnx6_super_block *)bh2->b_data; +		
brelse(bh1); +		printk(KERN_INFO "qnx6: superblock #2 active\n"); +	} +mmi_success: +	/* sanity check - limit maximum indirect pointer levels */ +	if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { +		printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n", +			QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); +		goto out; +	} +	if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { +		printk(KERN_ERR "qnx6: too many longfilename levels" +				" (max %i, sb %i)\n", +			QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); +		goto out; +	} +	s->s_op = &qnx6_sops; +	s->s_magic = QNX6_SUPER_MAGIC; +	s->s_flags |= MS_RDONLY;        /* Yup, read-only yet */ + +	/* ease the later tree level calculations */ +	sbi = QNX6_SB(s); +	sbi->s_ptrbits = ilog2(s->s_blocksize / 4); +	sbi->inodes = qnx6_private_inode(s, &sb1->Inode); +	if (!sbi->inodes) +		goto out; +	sbi->longfile = qnx6_private_inode(s, &sb1->Longfile); +	if (!sbi->longfile) +		goto out1; + +	/* prefetch root inode */ +	root = qnx6_iget(s, QNX6_ROOT_INO); +	if (IS_ERR(root)) { +		printk(KERN_ERR "qnx6: get inode failed\n"); +		ret = PTR_ERR(root); +		goto out2; +	} + +	ret = -ENOMEM; +	s->s_root = d_make_root(root); +	if (!s->s_root) +		goto out2; + +	ret = -EINVAL; +	errmsg = qnx6_checkroot(s); +	if (errmsg != NULL) { +		if (!silent) +			printk(KERN_ERR "qnx6: %s\n", errmsg); +		goto out3; +	} +	return 0; + +out3: +	dput(s->s_root); +	s->s_root = NULL; +out2: +	iput(sbi->longfile); +out1: +	iput(sbi->inodes); +out: +	if (bh1) +		brelse(bh1); +	if (bh2) +		brelse(bh2); +outnobh: +	kfree(qs); +	s->s_fs_info = NULL; +	return ret; +} + +static void qnx6_put_super(struct super_block *sb) +{ +	struct qnx6_sb_info *qs = QNX6_SB(sb); +	brelse(qs->sb_buf); +	iput(qs->longfile); +	iput(qs->inodes); +	kfree(qs); +	sb->s_fs_info = NULL; +	return; +} + +static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) +{ +	return generic_block_bmap(mapping, block, qnx6_get_block); +} +static const struct address_space_operations qnx6_aops 
= { +	.readpage	= qnx6_readpage, +	.readpages	= qnx6_readpages, +	.bmap		= qnx6_bmap +}; + +static struct inode *qnx6_private_inode(struct super_block *s, +					struct qnx6_root_node *p) +{ +	struct inode *inode = new_inode(s); +	if (inode) { +		struct qnx6_inode_info *ei = QNX6_I(inode); +		struct qnx6_sb_info *sbi = QNX6_SB(s); +		inode->i_size = fs64_to_cpu(sbi, p->size); +		memcpy(ei->di_block_ptr, p->ptr, sizeof(p->ptr)); +		ei->di_filelevels = p->levels; +		inode->i_mode = S_IFREG | S_IRUSR; /* probably wrong */ +		inode->i_mapping->a_ops = &qnx6_aops; +	} +	return inode; +} + +struct inode *qnx6_iget(struct super_block *sb, unsigned ino) +{ +	struct qnx6_sb_info *sbi = QNX6_SB(sb); +	struct qnx6_inode_entry *raw_inode; +	struct inode *inode; +	struct qnx6_inode_info	*ei; +	struct address_space *mapping; +	struct page *page; +	u32 n, offs; + +	inode = iget_locked(sb, ino); +	if (!inode) +		return ERR_PTR(-ENOMEM); +	if (!(inode->i_state & I_NEW)) +		return inode; + +	ei = QNX6_I(inode); + +	inode->i_mode = 0; + +	if (ino == 0) { +		printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is " +				"out of range\n", +		       sb->s_id, ino); +		iget_failed(inode); +		return ERR_PTR(-EIO); +	} +	n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS); +	offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS); +	mapping = sbi->inodes->i_mapping; +	page = read_mapping_page(mapping, n, NULL); +	if (IS_ERR(page)) { +		printk(KERN_ERR "qnx6: major problem: unable to read inode from " +		       "dev %s\n", sb->s_id); +		iget_failed(inode); +		return ERR_CAST(page); +	} +	kmap(page); +	raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; + +	inode->i_mode    = fs16_to_cpu(sbi, raw_inode->di_mode); +	inode->i_uid     = (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid); +	inode->i_gid     = (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid); +	inode->i_size    = fs64_to_cpu(sbi, raw_inode->di_size); +	inode->i_mtime.tv_sec   = fs32_to_cpu(sbi, 
raw_inode->di_mtime); +	inode->i_mtime.tv_nsec = 0; +	inode->i_atime.tv_sec   = fs32_to_cpu(sbi, raw_inode->di_atime); +	inode->i_atime.tv_nsec = 0; +	inode->i_ctime.tv_sec   = fs32_to_cpu(sbi, raw_inode->di_ctime); +	inode->i_ctime.tv_nsec = 0; + +	/* calc blocks based on 512 byte blocksize */ +	inode->i_blocks = (inode->i_size + 511) >> 9; + +	memcpy(&ei->di_block_ptr, &raw_inode->di_block_ptr, +				sizeof(raw_inode->di_block_ptr)); +	ei->di_filelevels = raw_inode->di_filelevels; + +	if (S_ISREG(inode->i_mode)) { +		inode->i_fop = &generic_ro_fops; +		inode->i_mapping->a_ops = &qnx6_aops; +	} else if (S_ISDIR(inode->i_mode)) { +		inode->i_op = &qnx6_dir_inode_operations; +		inode->i_fop = &qnx6_dir_operations; +		inode->i_mapping->a_ops = &qnx6_aops; +	} else if (S_ISLNK(inode->i_mode)) { +		inode->i_op = &page_symlink_inode_operations; +		inode->i_mapping->a_ops = &qnx6_aops; +	} else +		init_special_inode(inode, inode->i_mode, 0); +	qnx6_put_page(page); +	unlock_new_inode(inode); +	return inode; +} + +static struct kmem_cache *qnx6_inode_cachep; + +static struct inode *qnx6_alloc_inode(struct super_block *sb) +{ +	struct qnx6_inode_info *ei; +	ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL); +	if (!ei) +		return NULL; +	return &ei->vfs_inode; +} + +static void qnx6_i_callback(struct rcu_head *head) +{ +	struct inode *inode = container_of(head, struct inode, i_rcu); +	INIT_LIST_HEAD(&inode->i_dentry); +	kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode)); +} + +static void qnx6_destroy_inode(struct inode *inode) +{ +	call_rcu(&inode->i_rcu, qnx6_i_callback); +} + +static void init_once(void *foo) +{ +	struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo; + +	inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ +	qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", +					     sizeof(struct qnx6_inode_info), +					     0, (SLAB_RECLAIM_ACCOUNT| +						SLAB_MEM_SPREAD), +					     init_once); +	if (!qnx6_inode_cachep) +		return 
-ENOMEM; +	return 0; +} + +static void destroy_inodecache(void) +{ +	kmem_cache_destroy(qnx6_inode_cachep); +} + +static struct dentry *qnx6_mount(struct file_system_type *fs_type, +	int flags, const char *dev_name, void *data) +{ +	return mount_bdev(fs_type, flags, dev_name, data, qnx6_fill_super); +} + +static struct file_system_type qnx6_fs_type = { +	.owner		= THIS_MODULE, +	.name		= "qnx6", +	.mount		= qnx6_mount, +	.kill_sb	= kill_block_super, +	.fs_flags	= FS_REQUIRES_DEV, +}; + +static int __init init_qnx6_fs(void) +{ +	int err; + +	err = init_inodecache(); +	if (err) +		return err; + +	err = register_filesystem(&qnx6_fs_type); +	if (err) { +		destroy_inodecache(); +		return err; +	} + +	printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n"); +	return 0; +} + +static void __exit exit_qnx6_fs(void) +{ +	unregister_filesystem(&qnx6_fs_type); +	destroy_inodecache(); +} + +module_init(init_qnx6_fs) +module_exit(exit_qnx6_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c new file mode 100644 index 00000000000..8a97289e04a --- /dev/null +++ b/fs/qnx6/namei.c @@ -0,0 +1,42 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. 
+ * 16-02-2012 pagemap extension by Al Viro + * + */ + +#include "qnx6.h" + +struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, +				struct nameidata *nd) +{ +	unsigned ino; +	struct page *page; +	struct inode *foundinode = NULL; +	const char *name = dentry->d_name.name; +	int len = dentry->d_name.len; + +	if (len > QNX6_LONG_NAME_MAX) +		return ERR_PTR(-ENAMETOOLONG); + +	ino = qnx6_find_entry(len, dir, name, &page); +	if (ino) { +		foundinode = qnx6_iget(dir->i_sb, ino); +		qnx6_put_page(page); +		if (IS_ERR(foundinode)) { +			QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> " +				" error %ld\n", PTR_ERR(foundinode))); +			return ERR_CAST(foundinode); +		} +	} else { +		QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name)); +		return NULL; +	} +	d_add(dentry, foundinode); +	return NULL; +} diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h new file mode 100644 index 00000000000..6c5e02a0b6a --- /dev/null +++ b/fs/qnx6/qnx6.h @@ -0,0 +1,135 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. 
+ * 16-02-2012 page map extension by Al Viro + * + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> + +typedef __u16 __bitwise __fs16; +typedef __u32 __bitwise __fs32; +typedef __u64 __bitwise __fs64; + +#include <linux/qnx6_fs.h> + +#ifdef CONFIG_QNX6FS_DEBUG +#define QNX6DEBUG(X) printk X +#else +#define QNX6DEBUG(X) (void) 0 +#endif + +struct qnx6_sb_info { +	struct buffer_head	*sb_buf;	/* superblock buffer */ +	struct qnx6_super_block	*sb;		/* our superblock */ +	int			s_blks_off;	/* blkoffset fs-startpoint */ +	int			s_ptrbits;	/* indirect pointer bitfield */ +	unsigned long		s_mount_opt;	/* all mount options */ +	int			s_bytesex;	/* holds endianess info */ +	struct inode *		inodes; +	struct inode *		longfile; +}; + +struct qnx6_inode_info { +	__fs32			di_block_ptr[QNX6_NO_DIRECT_POINTERS]; +	__u8			di_filelevels; +	__u32			i_dir_start_lookup; +	struct inode		vfs_inode; +}; + +extern struct inode *qnx6_iget(struct super_block *sb, unsigned ino); +extern struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, +					struct nameidata *nd); + +#ifdef CONFIG_QNX6FS_DEBUG +extern void qnx6_superblock_debug(struct qnx6_super_block *, +						struct super_block *); +#endif + +extern const struct inode_operations qnx6_dir_inode_operations; +extern const struct file_operations qnx6_dir_operations; + +static inline struct qnx6_sb_info *QNX6_SB(struct super_block *sb) +{ +	return sb->s_fs_info; +} + +static inline struct qnx6_inode_info *QNX6_I(struct inode *inode) +{ +	return container_of(inode, struct qnx6_inode_info, vfs_inode); +} + +#define clear_opt(o, opt)		(o &= ~(QNX6_MOUNT_##opt)) +#define set_opt(o, opt)			(o |= (QNX6_MOUNT_##opt)) +#define test_opt(sb, opt)		(QNX6_SB(sb)->s_mount_opt & \ +					 QNX6_MOUNT_##opt) +enum { +	BYTESEX_LE, +	BYTESEX_BE, +}; + +static inline __u64 fs64_to_cpu(struct qnx6_sb_info *sbi, __fs64 n) +{ +	if (sbi->s_bytesex == BYTESEX_LE) +		return le64_to_cpu((__force __le64)n); +	else +		return be64_to_cpu((__force 
__be64)n); +} + +static inline __fs64 cpu_to_fs64(struct qnx6_sb_info *sbi, __u64 n) +{ +	if (sbi->s_bytesex == BYTESEX_LE) +		return (__force __fs64)cpu_to_le64(n); +	else +		return (__force __fs64)cpu_to_be64(n); +} + +static inline __u32 fs32_to_cpu(struct qnx6_sb_info *sbi, __fs32 n) +{ +	if (sbi->s_bytesex == BYTESEX_LE) +		return le32_to_cpu((__force __le32)n); +	else +		return be32_to_cpu((__force __be32)n); +} + +static inline __fs32 cpu_to_fs32(struct qnx6_sb_info *sbi, __u32 n) +{ +	if (sbi->s_bytesex == BYTESEX_LE) +		return (__force __fs32)cpu_to_le32(n); +	else +		return (__force __fs32)cpu_to_be32(n); +} + +static inline __u16 fs16_to_cpu(struct qnx6_sb_info *sbi, __fs16 n) +{ +	if (sbi->s_bytesex == BYTESEX_LE) +		return le16_to_cpu((__force __le16)n); +	else +		return be16_to_cpu((__force __be16)n); +} + +static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n) +{ +	if (sbi->s_bytesex == BYTESEX_LE) +		return (__force __fs16)cpu_to_le16(n); +	else +		return (__force __fs16)cpu_to_be16(n); +} + +extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, +						    int silent); + +static inline void qnx6_put_page(struct page *page) +{ +	kunmap(page); +	page_cache_release(page); +} + +extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, +				struct page **res_page); diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c new file mode 100644 index 00000000000..29c32cba62d --- /dev/null +++ b/fs/qnx6/super_mmi.c @@ -0,0 +1,150 @@ +/* + * QNX6 file system, Linux implementation. + * + * Version : 1.0.0 + * + * History : + * + * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. 
+ * + */ + +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/crc32.h> +#include "qnx6.h" + +static void qnx6_mmi_copy_sb(struct qnx6_super_block *qsb, +		struct qnx6_mmi_super_block *sb) +{ +	qsb->sb_magic = sb->sb_magic; +	qsb->sb_checksum = sb->sb_checksum; +	qsb->sb_serial = sb->sb_serial; +	qsb->sb_blocksize = sb->sb_blocksize; +	qsb->sb_num_inodes = sb->sb_num_inodes; +	qsb->sb_free_inodes = sb->sb_free_inodes; +	qsb->sb_num_blocks = sb->sb_num_blocks; +	qsb->sb_free_blocks = sb->sb_free_blocks; + +	/* the rest of the superblock is the same */ +	memcpy(&qsb->Inode, &sb->Inode, sizeof(sb->Inode)); +	memcpy(&qsb->Bitmap, &sb->Bitmap, sizeof(sb->Bitmap)); +	memcpy(&qsb->Longfile, &sb->Longfile, sizeof(sb->Longfile)); +} + +struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) +{ +	struct buffer_head *bh1, *bh2 = NULL; +	struct qnx6_mmi_super_block *sb1, *sb2; +	struct qnx6_super_block *qsb = NULL; +	struct qnx6_sb_info *sbi; +	__u64 offset; + +	/* Check the superblock signatures +	   start with the first superblock */ +	bh1 = sb_bread(s, 0); +	if (!bh1) { +		printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n"); +		return NULL; +	} +	sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; +	sbi = QNX6_SB(s); +	if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { +		if (!silent) { +			printk(KERN_ERR "qnx6: wrong signature (magic) in" +					" superblock #1.\n"); +			goto out; +		} +	} + +	/* checksum check - start at byte 8 and end at byte 512 */ +	if (fs32_to_cpu(sbi, sb1->sb_checksum) != +				crc32_be(0, (char *)(bh1->b_data + 8), 504)) { +		printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); +		goto out; +	} + +	/* calculate second superblock blocknumber */ +	offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + QNX6_SUPERBLOCK_AREA / +					fs32_to_cpu(sbi, sb1->sb_blocksize); + +	/* set new blocksize */ +	if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { +		printk(KERN_ERR "qnx6: unable 
to set blocksize\n"); +		goto out; +	} +	/* blocksize invalidates bh - pull it back in */ +	brelse(bh1); +	bh1 = sb_bread(s, 0); +	if (!bh1) +		goto out; +	sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; + +	/* read second superblock */ +	bh2 = sb_bread(s, offset); +	if (!bh2) { +		printk(KERN_ERR "qnx6: unable to read the second superblock\n"); +		goto out; +	} +	sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; +	if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { +		if (!silent) +			printk(KERN_ERR "qnx6: wrong signature (magic) in" +					" superblock #2.\n"); +		goto out; +	} + +	/* checksum check - start at byte 8 and end at byte 512 */ +	if (fs32_to_cpu(sbi, sb2->sb_checksum) +			!= crc32_be(0, (char *)(bh2->b_data + 8), 504)) { +		printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); +		goto out; +	} + +	qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); +	if (!qsb) { +		printk(KERN_ERR "qnx6: unable to allocate memory.\n"); +		goto out; +	} + +	if (fs64_to_cpu(sbi, sb1->sb_serial) > +					fs64_to_cpu(sbi, sb2->sb_serial)) { +		/* superblock #1 active */ +		qnx6_mmi_copy_sb(qsb, sb1); +#ifdef CONFIG_QNX6FS_DEBUG +		qnx6_superblock_debug(qsb, s); +#endif +		memcpy(bh1->b_data, qsb, sizeof(struct qnx6_super_block)); + +		sbi->sb_buf = bh1; +		sbi->sb = (struct qnx6_super_block *)bh1->b_data; +		brelse(bh2); +		printk(KERN_INFO "qnx6: superblock #1 active\n"); +	} else { +		/* superblock #2 active */ +		qnx6_mmi_copy_sb(qsb, sb2); +#ifdef CONFIG_QNX6FS_DEBUG +		qnx6_superblock_debug(qsb, s); +#endif +		memcpy(bh2->b_data, qsb, sizeof(struct qnx6_super_block)); + +		sbi->sb_buf = bh2; +		sbi->sb = (struct qnx6_super_block *)bh2->b_data; +		brelse(bh1); +		printk(KERN_INFO "qnx6: superblock #2 active\n"); +	} +	kfree(qsb); + +	/* offset for mmi_fs is just SUPERBLOCK_AREA bytes */ +	sbi->s_blks_off = QNX6_SUPERBLOCK_AREA / s->s_blocksize; + +	/* success */ +	return sbi->sb; + +out: +	if (bh1 != NULL) +		brelse(bh1); +	if (bh2 != NULL) +		brelse(bh2); +	return 
NULL; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 5ec59b20cf7..d69a1d1d7e1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -71,6 +71,7 @@  #include <linux/module.h>  #include <linux/proc_fs.h>  #include <linux/security.h> +#include <linux/sched.h>  #include <linux/kmod.h>  #include <linux/namei.h>  #include <linux/capability.h> @@ -1109,6 +1110,13 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number)  	clear_bit(DQ_BLKS_B, &dquot->dq_flags);  } +struct dquot_warn { +	struct super_block *w_sb; +	qid_t w_dq_id; +	short w_dq_type; +	short w_type; +}; +  static int warning_issued(struct dquot *dquot, const int warntype)  {  	int flag = (warntype == QUOTA_NL_BHARDWARN || @@ -1124,41 +1132,42 @@ static int warning_issued(struct dquot *dquot, const int warntype)  #ifdef CONFIG_PRINT_QUOTA_WARNING  static int flag_print_warnings = 1; -static int need_print_warning(struct dquot *dquot) +static int need_print_warning(struct dquot_warn *warn)  {  	if (!flag_print_warnings)  		return 0; -	switch (dquot->dq_type) { +	switch (warn->w_dq_type) {  		case USRQUOTA: -			return current_fsuid() == dquot->dq_id; +			return current_fsuid() == warn->w_dq_id;  		case GRPQUOTA: -			return in_group_p(dquot->dq_id); +			return in_group_p(warn->w_dq_id);  	}  	return 0;  }  /* Print warning to user which exceeded quota */ -static void print_warning(struct dquot *dquot, const int warntype) +static void print_warning(struct dquot_warn *warn)  {  	char *msg = NULL;  	struct tty_struct *tty; +	int warntype = warn->w_type;  	if (warntype == QUOTA_NL_IHARDBELOW ||  	    warntype == QUOTA_NL_ISOFTBELOW ||  	    warntype == QUOTA_NL_BHARDBELOW || -	    warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) +	    warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn))  		return;  	tty = get_current_tty();  	if (!tty)  		return; -	tty_write_message(tty, dquot->dq_sb->s_id); +	tty_write_message(tty, warn->w_sb->s_id);  	if (warntype == 
QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)  		tty_write_message(tty, ": warning, ");  	else  		tty_write_message(tty, ": write failed, "); -	tty_write_message(tty, quotatypes[dquot->dq_type]); +	tty_write_message(tty, quotatypes[warn->w_dq_type]);  	switch (warntype) {  		case QUOTA_NL_IHARDWARN:  			msg = " file limit reached.\r\n"; @@ -1184,26 +1193,34 @@ static void print_warning(struct dquot *dquot, const int warntype)  }  #endif +static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, +			    int warntype) +{ +	if (warning_issued(dquot, warntype)) +		return; +	warn->w_type = warntype; +	warn->w_sb = dquot->dq_sb; +	warn->w_dq_id = dquot->dq_id; +	warn->w_dq_type = dquot->dq_type; +} +  /*   * Write warnings to the console and send warning messages over netlink.   * - * Note that this function can sleep. + * Note that this function can call into tty and networking code.   */ -static void flush_warnings(struct dquot *const *dquots, char *warntype) +static void flush_warnings(struct dquot_warn *warn)  { -	struct dquot *dq;  	int i;  	for (i = 0; i < MAXQUOTAS; i++) { -		dq = dquots[i]; -		if (dq && warntype[i] != QUOTA_NL_NOWARN && -		    !warning_issued(dq, warntype[i])) { +		if (warn[i].w_type == QUOTA_NL_NOWARN) +			continue;  #ifdef CONFIG_PRINT_QUOTA_WARNING -			print_warning(dq, warntype[i]); +		print_warning(&warn[i]);  #endif -			quota_send_warning(dq->dq_type, dq->dq_id, -					   dq->dq_sb->s_dev, warntype[i]); -		} +		quota_send_warning(warn[i].w_dq_type, warn[i].w_dq_id, +				   warn[i].w_sb->s_dev, warn[i].w_type);  	}  } @@ -1217,11 +1234,11 @@ static int ignore_hardlimit(struct dquot *dquot)  }  /* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) +static int check_idq(struct dquot *dquot, qsize_t inodes, +		     struct dquot_warn *warn)  {  	qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; -	*warntype = QUOTA_NL_NOWARN;  	if 
(!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||  	    test_bit(DQ_FAKE_B, &dquot->dq_flags))  		return 0; @@ -1229,7 +1246,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)  	if (dquot->dq_dqb.dqb_ihardlimit &&  	    newinodes > dquot->dq_dqb.dqb_ihardlimit &&              !ignore_hardlimit(dquot)) { -		*warntype = QUOTA_NL_IHARDWARN; +		prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);  		return -EDQUOT;  	} @@ -1238,14 +1255,14 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)  	    dquot->dq_dqb.dqb_itime &&  	    get_seconds() >= dquot->dq_dqb.dqb_itime &&              !ignore_hardlimit(dquot)) { -		*warntype = QUOTA_NL_ISOFTLONGWARN; +		prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);  		return -EDQUOT;  	}  	if (dquot->dq_dqb.dqb_isoftlimit &&  	    newinodes > dquot->dq_dqb.dqb_isoftlimit &&  	    dquot->dq_dqb.dqb_itime == 0) { -		*warntype = QUOTA_NL_ISOFTWARN; +		prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);  		dquot->dq_dqb.dqb_itime = get_seconds() +  		    sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;  	} @@ -1254,12 +1271,12 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)  }  /* needs dq_data_lock */ -static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) +static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, +		     struct dquot_warn *warn)  {  	qsize_t tspace;  	struct super_block *sb = dquot->dq_sb; -	*warntype = QUOTA_NL_NOWARN;  	if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||  	    test_bit(DQ_FAKE_B, &dquot->dq_flags))  		return 0; @@ -1271,7 +1288,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war  	    tspace > dquot->dq_dqb.dqb_bhardlimit &&              !ignore_hardlimit(dquot)) {  		if (!prealloc) -			*warntype = QUOTA_NL_BHARDWARN; +			prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);  		return -EDQUOT;  	} @@ -1281,7 +1298,7 @@ 
static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war  	    get_seconds() >= dquot->dq_dqb.dqb_btime &&              !ignore_hardlimit(dquot)) {  		if (!prealloc) -			*warntype = QUOTA_NL_BSOFTLONGWARN; +			prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);  		return -EDQUOT;  	} @@ -1289,7 +1306,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war  	    tspace > dquot->dq_dqb.dqb_bsoftlimit &&  	    dquot->dq_dqb.dqb_btime == 0) {  		if (!prealloc) { -			*warntype = QUOTA_NL_BSOFTWARN; +			prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);  			dquot->dq_dqb.dqb_btime = get_seconds() +  			    sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace;  		} @@ -1542,10 +1559,9 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)  int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)  {  	int cnt, ret = 0; -	char warntype[MAXQUOTAS]; -	int warn = flags & DQUOT_SPACE_WARN; +	struct dquot_warn warn[MAXQUOTAS]; +	struct dquot **dquots = inode->i_dquot;  	int reserve = flags & DQUOT_SPACE_RESERVE; -	int nofail = flags & DQUOT_SPACE_NOFAIL;  	/*  	 * First test before acquiring mutex - solves deadlocks when we @@ -1558,36 +1574,36 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)  	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) -		warntype[cnt] = QUOTA_NL_NOWARN; +		warn[cnt].w_type = QUOTA_NL_NOWARN;  	spin_lock(&dq_data_lock);  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -		if (!inode->i_dquot[cnt]) +		if (!dquots[cnt])  			continue; -		ret = check_bdq(inode->i_dquot[cnt], number, !warn, -				warntype+cnt); -		if (ret && !nofail) { +		ret = check_bdq(dquots[cnt], number, +				!(flags & DQUOT_SPACE_WARN), &warn[cnt]); +		if (ret && !(flags & DQUOT_SPACE_NOFAIL)) {  			spin_unlock(&dq_data_lock);  			goto out_flush_warn;  		}  	}  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -		if (!inode->i_dquot[cnt]) +		if 
(!dquots[cnt])  			continue;  		if (reserve) -			dquot_resv_space(inode->i_dquot[cnt], number); +			dquot_resv_space(dquots[cnt], number);  		else -			dquot_incr_space(inode->i_dquot[cnt], number); +			dquot_incr_space(dquots[cnt], number);  	}  	inode_incr_space(inode, number, reserve);  	spin_unlock(&dq_data_lock);  	if (reserve)  		goto out_flush_warn; -	mark_all_dquot_dirty(inode->i_dquot); +	mark_all_dquot_dirty(dquots);  out_flush_warn: -	flush_warnings(inode->i_dquot, warntype);  	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +	flush_warnings(warn);  out:  	return ret;  } @@ -1599,36 +1615,37 @@ EXPORT_SYMBOL(__dquot_alloc_space);  int dquot_alloc_inode(const struct inode *inode)  {  	int cnt, ret = 0; -	char warntype[MAXQUOTAS]; +	struct dquot_warn warn[MAXQUOTAS]; +	struct dquot * const *dquots = inode->i_dquot;  	/* First test before acquiring mutex - solves deadlocks when we           * re-enter the quota code and are already holding the mutex */  	if (!dquot_active(inode))  		return 0;  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) -		warntype[cnt] = QUOTA_NL_NOWARN; +		warn[cnt].w_type = QUOTA_NL_NOWARN;  	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);  	spin_lock(&dq_data_lock);  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -		if (!inode->i_dquot[cnt]) +		if (!dquots[cnt])  			continue; -		ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); +		ret = check_idq(dquots[cnt], 1, &warn[cnt]);  		if (ret)  			goto warn_put_all;  	}  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -		if (!inode->i_dquot[cnt]) +		if (!dquots[cnt])  			continue; -		dquot_incr_inodes(inode->i_dquot[cnt], 1); +		dquot_incr_inodes(dquots[cnt], 1);  	}  warn_put_all:  	spin_unlock(&dq_data_lock);  	if (ret == 0) -		mark_all_dquot_dirty(inode->i_dquot); -	flush_warnings(inode->i_dquot, warntype); +		mark_all_dquot_dirty(dquots);  	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +	flush_warnings(warn);  	return ret;  }  EXPORT_SYMBOL(dquot_alloc_inode); @@ -1668,7 +1685,8 @@ 
EXPORT_SYMBOL(dquot_claim_space_nodirty);  void __dquot_free_space(struct inode *inode, qsize_t number, int flags)  {  	unsigned int cnt; -	char warntype[MAXQUOTAS]; +	struct dquot_warn warn[MAXQUOTAS]; +	struct dquot **dquots = inode->i_dquot;  	int reserve = flags & DQUOT_SPACE_RESERVE;  	/* First test before acquiring mutex - solves deadlocks when we @@ -1681,23 +1699,28 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)  	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);  	spin_lock(&dq_data_lock);  	for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -		if (!inode->i_dquot[cnt]) +		int wtype; + +		warn[cnt].w_type = QUOTA_NL_NOWARN; +		if (!dquots[cnt])  			continue; -		warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); +		wtype = info_bdq_free(dquots[cnt], number); +		if (wtype != QUOTA_NL_NOWARN) +			prepare_warning(&warn[cnt], dquots[cnt], wtype);  		if (reserve) -			dquot_free_reserved_space(inode->i_dquot[cnt], number); +			dquot_free_reserved_space(dquots[cnt], number);  		else -			dquot_decr_space(inode->i_dquot[cnt], number); +			dquot_decr_space(dquots[cnt], number);  	}  	inode_decr_space(inode, number, reserve);  	spin_unlock(&dq_data_lock);  	if (reserve)  		goto out_unlock; -	mark_all_dquot_dirty(inode->i_dquot); +	mark_all_dquot_dirty(dquots);  out_unlock: -	flush_warnings(inode->i_dquot, warntype);  	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +	flush_warnings(warn);  }  EXPORT_SYMBOL(__dquot_free_space); @@ -1707,7 +1730,8 @@ EXPORT_SYMBOL(__dquot_free_space);  void dquot_free_inode(const struct inode *inode)  {  	unsigned int cnt; -	char warntype[MAXQUOTAS]; +	struct dquot_warn warn[MAXQUOTAS]; +	struct dquot * const *dquots = inode->i_dquot;  	/* First test before acquiring mutex - solves deadlocks when we           * re-enter the quota code and are already holding the mutex */ @@ -1717,15 +1741,20 @@ void dquot_free_inode(const struct inode *inode)  	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);  	spin_lock(&dq_data_lock);  	
for (cnt = 0; cnt < MAXQUOTAS; cnt++) { -		if (!inode->i_dquot[cnt]) +		int wtype; + +		warn[cnt].w_type = QUOTA_NL_NOWARN; +		if (!dquots[cnt])  			continue; -		warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); -		dquot_decr_inodes(inode->i_dquot[cnt], 1); +		wtype = info_idq_free(dquots[cnt], 1); +		if (wtype != QUOTA_NL_NOWARN) +			prepare_warning(&warn[cnt], dquots[cnt], wtype); +		dquot_decr_inodes(dquots[cnt], 1);  	}  	spin_unlock(&dq_data_lock); -	mark_all_dquot_dirty(inode->i_dquot); -	flush_warnings(inode->i_dquot, warntype); +	mark_all_dquot_dirty(dquots);  	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +	flush_warnings(warn);  }  EXPORT_SYMBOL(dquot_free_inode); @@ -1746,16 +1775,20 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)  	struct dquot *transfer_from[MAXQUOTAS] = {};  	int cnt, ret = 0;  	char is_valid[MAXQUOTAS] = {}; -	char warntype_to[MAXQUOTAS]; -	char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; +	struct dquot_warn warn_to[MAXQUOTAS]; +	struct dquot_warn warn_from_inodes[MAXQUOTAS]; +	struct dquot_warn warn_from_space[MAXQUOTAS];  	/* First test before acquiring mutex - solves deadlocks when we           * re-enter the quota code and are already holding the mutex */  	if (IS_NOQUOTA(inode))  		return 0;  	/* Initialize the arrays */ -	for (cnt = 0; cnt < MAXQUOTAS; cnt++) -		warntype_to[cnt] = QUOTA_NL_NOWARN; +	for (cnt = 0; cnt < MAXQUOTAS; cnt++) { +		warn_to[cnt].w_type = QUOTA_NL_NOWARN; +		warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; +		warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; +	}  	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);  	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? 
*/  		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1777,10 +1810,10 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)  			continue;  		is_valid[cnt] = 1;  		transfer_from[cnt] = inode->i_dquot[cnt]; -		ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); +		ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]);  		if (ret)  			goto over_quota; -		ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); +		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);  		if (ret)  			goto over_quota;  	} @@ -1793,10 +1826,15 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)  			continue;  		/* Due to IO error we might not have transfer_from[] structure */  		if (transfer_from[cnt]) { -			warntype_from_inodes[cnt] = -				info_idq_free(transfer_from[cnt], 1); -			warntype_from_space[cnt] = -				info_bdq_free(transfer_from[cnt], space); +			int wtype; +			wtype = info_idq_free(transfer_from[cnt], 1); +			if (wtype != QUOTA_NL_NOWARN) +				prepare_warning(&warn_from_inodes[cnt], +						transfer_from[cnt], wtype); +			wtype = info_bdq_free(transfer_from[cnt], space); +			if (wtype != QUOTA_NL_NOWARN) +				prepare_warning(&warn_from_space[cnt], +						transfer_from[cnt], wtype);  			dquot_decr_inodes(transfer_from[cnt], 1);  			dquot_decr_space(transfer_from[cnt], cur_space);  			dquot_free_reserved_space(transfer_from[cnt], @@ -1814,9 +1852,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)  	mark_all_dquot_dirty(transfer_from);  	mark_all_dquot_dirty(transfer_to); -	flush_warnings(transfer_to, warntype_to); -	flush_warnings(transfer_from, warntype_from_inodes); -	flush_warnings(transfer_from, warntype_from_space); +	flush_warnings(warn_to); +	flush_warnings(warn_from_inodes); +	flush_warnings(warn_from_space);  	/* Pass back references to put */  	for (cnt = 0; cnt < MAXQUOTAS; cnt++)  		if (is_valid[cnt]) @@ -1825,7 +1863,7 @@ int __dquot_transfer(struct inode *inode, struct dquot 
**transfer_to)  over_quota:  	spin_unlock(&dq_data_lock);  	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); -	flush_warnings(transfer_to, warntype_to); +	flush_warnings(warn_to);  	return ret;  }  EXPORT_SYMBOL(__dquot_transfer); @@ -2125,6 +2163,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,  		mutex_unlock(&dqopt->dqio_mutex);  		goto out_file_init;  	} +	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) +		dqopt->info[type].dqi_flags |= DQF_SYS_FILE;  	mutex_unlock(&dqopt->dqio_mutex);  	spin_lock(&dq_state_lock);  	dqopt->flags |= dquot_state_flag(flags, type); @@ -2464,7 +2504,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)  	spin_lock(&dq_data_lock);  	ii->dqi_bgrace = mi->dqi_bgrace;  	ii->dqi_igrace = mi->dqi_igrace; -	ii->dqi_flags = mi->dqi_flags & DQF_MASK; +	ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;  	ii->dqi_valid = IIF_ALL;  	spin_unlock(&dq_data_lock);  	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); @@ -2490,8 +2530,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)  	if (ii->dqi_valid & IIF_IGRACE)  		mi->dqi_igrace = ii->dqi_igrace;  	if (ii->dqi_valid & IIF_FLAGS) -		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | -				(ii->dqi_flags & DQF_MASK); +		mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) | +				(ii->dqi_flags & DQF_SETINFO_MASK);  	spin_unlock(&dq_data_lock);  	mark_info_dirty(sb, type);  	/* Force write to disk */ diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 7a9bedeb1d5..9a391204ca2 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -291,11 +291,26 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,  	}  } +/* Return 1 if 'cmd' will block on frozen filesystem */ +static int quotactl_cmd_write(int cmd) +{ +	switch (cmd) { +	case Q_GETFMT: +	case Q_GETINFO: +	case Q_SYNC: +	case Q_XGETQSTAT: +	case Q_XGETQUOTA: +	case Q_XQUOTASYNC: +		return 0; +	} +	return 1; +} +  /*   * look up a superblock on which 
quota ops will be performed   * - use the name of a block device to find the superblock thereon   */ -static struct super_block *quotactl_block(const char __user *special) +static struct super_block *quotactl_block(const char __user *special, int cmd)  {  #ifdef CONFIG_BLOCK  	struct block_device *bdev; @@ -308,7 +323,10 @@ static struct super_block *quotactl_block(const char __user *special)  	putname(tmp);  	if (IS_ERR(bdev))  		return ERR_CAST(bdev); -	sb = get_super(bdev); +	if (quotactl_cmd_write(cmd)) +		sb = get_super_thawed(bdev); +	else +		sb = get_super(bdev);  	bdput(bdev);  	if (!sb)  		return ERR_PTR(-ENODEV); @@ -360,7 +378,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,  			pathp = &path;  	} -	sb = quotactl_block(special); +	sb = quotactl_block(special, cmds);  	if (IS_ERR(sb)) {  		ret = PTR_ERR(sb);  		goto out; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index aec766abe3a..a1fdabe21de 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -209,22 +209,19 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)  int ramfs_fill_super(struct super_block *sb, void *data, int silent)  {  	struct ramfs_fs_info *fsi; -	struct inode *inode = NULL; -	struct dentry *root; +	struct inode *inode;  	int err;  	save_mount_options(sb, data);  	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);  	sb->s_fs_info = fsi; -	if (!fsi) { -		err = -ENOMEM; -		goto fail; -	} +	if (!fsi) +		return -ENOMEM;  	err = ramfs_parse_options(data, &fsi->mount_opts);  	if (err) -		goto fail; +		return err;  	sb->s_maxbytes		= MAX_LFS_FILESIZE;  	sb->s_blocksize		= PAGE_CACHE_SIZE; @@ -234,24 +231,11 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent)  	sb->s_time_gran		= 1;  	inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); -	if (!inode) { -		err = -ENOMEM; -		goto fail; -	} - -	root = d_alloc_root(inode); -	sb->s_root = root; -	if (!root) { -		err = -ENOMEM; -		goto fail; -	} 
+	sb->s_root = d_make_root(inode); +	if (!sb->s_root) +		return -ENOMEM;  	return 0; -fail: -	kfree(fsi); -	sb->s_fs_info = NULL; -	iput(inode); -	return err;  }  struct dentry *ramfs_mount(struct file_system_type *fs_type, diff --git a/fs/read_write.c b/fs/read_write.c index 5ad4248b0cd..ffc99d22e0a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -11,7 +11,7 @@  #include <linux/uio.h>  #include <linux/fsnotify.h>  #include <linux/security.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/syscalls.h>  #include <linux/pagemap.h>  #include <linux/splice.h> diff --git a/fs/readdir.c b/fs/readdir.c index 356f71528ad..cc0a8227cdd 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -6,7 +6,7 @@  #include <linux/stddef.h>  #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/time.h>  #include <linux/mm.h>  #include <linux/errno.h> diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h new file mode 100644 index 00000000000..f096b80e73d --- /dev/null +++ b/fs/reiserfs/acl.h @@ -0,0 +1,76 @@ +#include <linux/init.h> +#include <linux/posix_acl.h> + +#define REISERFS_ACL_VERSION	0x0001 + +typedef struct { +	__le16 e_tag; +	__le16 e_perm; +	__le32 e_id; +} reiserfs_acl_entry; + +typedef struct { +	__le16 e_tag; +	__le16 e_perm; +} reiserfs_acl_entry_short; + +typedef struct { +	__le32 a_version; +} reiserfs_acl_header; + +static inline size_t reiserfs_acl_size(int count) +{ +	if (count <= 4) { +		return sizeof(reiserfs_acl_header) + +		    count * sizeof(reiserfs_acl_entry_short); +	} else { +		return sizeof(reiserfs_acl_header) + +		    4 * sizeof(reiserfs_acl_entry_short) + +		    (count - 4) * sizeof(reiserfs_acl_entry); +	} +} + +static inline int reiserfs_acl_count(size_t size) +{ +	ssize_t s; +	size -= sizeof(reiserfs_acl_header); +	s = size - 4 * sizeof(reiserfs_acl_entry_short); +	if (s < 0) { +		if (size % sizeof(reiserfs_acl_entry_short)) +			return -1; +		return size / 
sizeof(reiserfs_acl_entry_short); +	} else { +		if (s % sizeof(reiserfs_acl_entry)) +			return -1; +		return s / sizeof(reiserfs_acl_entry) + 4; +	} +} + +#ifdef CONFIG_REISERFS_FS_POSIX_ACL +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +int reiserfs_acl_chmod(struct inode *inode); +int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, +				 struct inode *dir, struct dentry *dentry, +				 struct inode *inode); +int reiserfs_cache_default_acl(struct inode *dir); +extern const struct xattr_handler reiserfs_posix_acl_default_handler; +extern const struct xattr_handler reiserfs_posix_acl_access_handler; + +#else + +#define reiserfs_cache_default_acl(inode) 0 +#define reiserfs_get_acl NULL + +static inline int reiserfs_acl_chmod(struct inode *inode) +{ +	return 0; +} + +static inline int +reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, +			     const struct inode *dir, struct dentry *dentry, +			     struct inode *inode) +{ +	return 0; +} +#endif diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 70de42f09f1..4c0c7d163d1 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -4,14 +4,12 @@  /* Reiserfs block (de)allocator, bitmap-based. 
*/  #include <linux/time.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/errno.h>  #include <linux/buffer_head.h>  #include <linux/kernel.h>  #include <linux/pagemap.h>  #include <linux/vmalloc.h> -#include <linux/reiserfs_fs_sb.h> -#include <linux/reiserfs_fs_i.h>  #include <linux/quotaops.h>  #include <linux/seq_file.h> diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 133e9355dc6..66c53b642a8 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -5,7 +5,7 @@  #include <linux/string.h>  #include <linux/errno.h>  #include <linux/fs.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/stat.h>  #include <linux/buffer_head.h>  #include <linux/slab.h> diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 60c08044066..2b7882b508d 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -17,7 +17,7 @@  #include <asm/uaccess.h>  #include <linux/time.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/buffer_head.h>  #include <linux/kernel.h> diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index ace635053a3..8375c922c0d 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -3,9 +3,9 @@   */  #include <linux/time.h> -#include <linux/reiserfs_fs.h> -#include <linux/reiserfs_acl.h> -#include <linux/reiserfs_xattr.h> +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h"  #include <asm/uaccess.h>  #include <linux/pagemap.h>  #include <linux/swap.h> diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 1e4250bc3a6..430e0658704 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -37,7 +37,7 @@  #include <linux/time.h>  #include <linux/slab.h>  #include <linux/string.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/buffer_head.h>  /* To make any changes in the tree we find a node, that contains item diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index 6471c670743..91b0cc1242a 100644 --- 
a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -19,7 +19,7 @@  //  #include <linux/kernel.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <asm/types.h>  #define DELTA 0x9E3779B9 diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index 2074fd95046..e1978fd895f 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -5,7 +5,7 @@  #include <asm/uaccess.h>  #include <linux/string.h>  #include <linux/time.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/buffer_head.h>  /* this is one and only function that is used outside (do_balance.c) */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9e8cd5acd79..494c315c741 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -4,9 +4,9 @@  #include <linux/time.h>  #include <linux/fs.h> -#include <linux/reiserfs_fs.h> -#include <linux/reiserfs_acl.h> -#include <linux/reiserfs_xattr.h> +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h"  #include <linux/exportfs.h>  #include <linux/pagemap.h>  #include <linux/highmem.h> diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 950e3d1b5c9..0c2185042d5 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -5,7 +5,7 @@  #include <linux/capability.h>  #include <linux/fs.h>  #include <linux/mount.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/time.h>  #include <asm/uaccess.h>  #include <linux/pagemap.h> diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index 72cb1cc51b8..ee382ef3d30 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -3,7 +3,7 @@   */  #include <linux/time.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  // this contains item handlers for old item types: sd, direct,  // indirect, directory diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index c3cf54fd4de..cf9f4de00a9 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -37,7 +37,7 @@  #include <linux/time.h>  #include 
<linux/semaphore.h>  #include <linux/vmalloc.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/kernel.h>  #include <linux/errno.h>  #include <linux/fcntl.h> diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 03d85cbf90b..79e5a8b4c22 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -5,7 +5,7 @@  #include <asm/uaccess.h>  #include <linux/string.h>  #include <linux/time.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/buffer_head.h>  /* these are used in do_balance.c */ @@ -975,7 +975,7 @@ static int leaf_cut_entries(struct buffer_head *bh,  	   remove */  	RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");  	RFALSE(I_ENTRY_COUNT(ih) < from + del_count, -	       "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", +	       "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",  	       I_ENTRY_COUNT(ih), from, del_count);  	if (del_count == 0) diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index 7df1ce48203..d735bc8470e 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -1,4 +1,4 @@ -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/mutex.h>  /* diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 14637886523..84e8a69cee9 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -14,9 +14,9 @@  #include <linux/time.h>  #include <linux/bitops.h>  #include <linux/slab.h> -#include <linux/reiserfs_fs.h> -#include <linux/reiserfs_acl.h> -#include <linux/reiserfs_xattr.h> +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h"  #include <linux/quotaops.h>  #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 3a6de810bd6..f732d6a5251 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -5,8 +5,7 @@  
#include <linux/string.h>  #include <linux/random.h>  #include <linux/time.h> -#include <linux/reiserfs_fs.h> -#include <linux/reiserfs_fs_sb.h> +#include "reiserfs.h"  // find where objectid map starts  #define objectid_map(s,rs) (old_format_only (s) ? \ diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 45de98b5946..c0b1112ab7e 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -4,7 +4,7 @@  #include <linux/time.h>  #include <linux/fs.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/string.h>  #include <linux/buffer_head.h> @@ -329,7 +329,7 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)      Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it      pointless complexity): -    panics in reiserfs_fs.h have numbers from 1000 to 1999 +    panics in reiserfs.h have numbers from 1000 to 1999      super.c				        2000 to 2999      preserve.c (unused)			    3000 to 3999      bitmap.c				    4000 to 4999 diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 7a9981196c1..2c1ade692cc 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -12,8 +12,7 @@  #include <linux/time.h>  #include <linux/seq_file.h>  #include <asm/uaccess.h> -#include <linux/reiserfs_fs.h> -#include <linux/reiserfs_fs_sb.h> +#include "reiserfs.h"  #include <linux/init.h>  #include <linux/proc_fs.h> diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h new file mode 100644 index 00000000000..a59d2712633 --- /dev/null +++ b/fs/reiserfs/reiserfs.h @@ -0,0 +1,2923 @@ +/* + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details + */ + +#include <linux/reiserfs_fs.h> + +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/bug.h> +#include <linux/workqueue.h> +#include <asm/unaligned.h> +#include <linux/bitops.h> +#include <linux/proc_fs.h> +#include 
<linux/buffer_head.h> + +/* the 32 bit compat definitions with int argument */ +#define REISERFS_IOC32_UNPACK		_IOW(0xCD, 1, int) +#define REISERFS_IOC32_GETFLAGS		FS_IOC32_GETFLAGS +#define REISERFS_IOC32_SETFLAGS		FS_IOC32_SETFLAGS +#define REISERFS_IOC32_GETVERSION	FS_IOC32_GETVERSION +#define REISERFS_IOC32_SETVERSION	FS_IOC32_SETVERSION + +struct reiserfs_journal_list; + +/** bitmasks for i_flags field in reiserfs-specific part of inode */ +typedef enum { +    /** this says what format of key do all items (but stat data) of +      an object have.  If this is set, that format is 3.6 otherwise +      - 3.5 */ +	i_item_key_version_mask = 0x0001, +    /** If this is unset, object has 3.5 stat data, otherwise, it has +      3.6 stat data with 64bit size, 32bit nlink etc. */ +	i_stat_data_version_mask = 0x0002, +    /** file might need tail packing on close */ +	i_pack_on_close_mask = 0x0004, +    /** don't pack tail of file */ +	i_nopack_mask = 0x0008, +    /** If those is set, "safe link" was created for this file during +      truncate or unlink. Safe link is used to avoid leakage of disk +      space on crash with some files open, but unlinked. */ +	i_link_saved_unlink_mask = 0x0010, +	i_link_saved_truncate_mask = 0x0020, +	i_has_xattr_dir = 0x0040, +	i_data_log = 0x0080, +} reiserfs_inode_flags; + +struct reiserfs_inode_info { +	__u32 i_key[4];		/* key is still 4 32 bit integers */ +    /** transient inode flags that are never stored on disk. Bitmasks +      for this field are defined above. */ +	__u32 i_flags; + +	__u32 i_first_direct_byte;	// offset of first byte stored in direct item. + +	/* copy of persistent inode flags read from sd_attrs. 
*/ +	__u32 i_attrs; + +	int i_prealloc_block;	/* first unused block of a sequence of unused blocks */ +	int i_prealloc_count;	/* length of that sequence */ +	struct list_head i_prealloc_list;	/* per-transaction list of inodes which +						 * have preallocated blocks */ + +	unsigned new_packing_locality:1;	/* new_packig_locality is created; new blocks +						 * for the contents of this directory should be +						 * displaced */ + +	/* we use these for fsync or O_SYNC to decide which transaction +	 ** needs to be committed in order for this inode to be properly +	 ** flushed */ +	unsigned int i_trans_id; +	struct reiserfs_journal_list *i_jl; +	atomic_t openers; +	struct mutex tailpack; +#ifdef CONFIG_REISERFS_FS_XATTR +	struct rw_semaphore i_xattr_sem; +#endif +	struct inode vfs_inode; +}; + +typedef enum { +	reiserfs_attrs_cleared = 0x00000001, +} reiserfs_super_block_flags; + +/* struct reiserfs_super_block accessors/mutators + * since this is a disk structure, it will always be in + * little endian format. 
*/ +#define sb_block_count(sbp)         (le32_to_cpu((sbp)->s_v1.s_block_count)) +#define set_sb_block_count(sbp,v)   ((sbp)->s_v1.s_block_count = cpu_to_le32(v)) +#define sb_free_blocks(sbp)         (le32_to_cpu((sbp)->s_v1.s_free_blocks)) +#define set_sb_free_blocks(sbp,v)   ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v)) +#define sb_root_block(sbp)          (le32_to_cpu((sbp)->s_v1.s_root_block)) +#define set_sb_root_block(sbp,v)    ((sbp)->s_v1.s_root_block = cpu_to_le32(v)) + +#define sb_jp_journal_1st_block(sbp)  \ +              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block)) +#define set_sb_jp_journal_1st_block(sbp,v) \ +              ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v)) +#define sb_jp_journal_dev(sbp) \ +              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev)) +#define set_sb_jp_journal_dev(sbp,v) \ +              ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v)) +#define sb_jp_journal_size(sbp) \ +              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size)) +#define set_sb_jp_journal_size(sbp,v) \ +              ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v)) +#define sb_jp_journal_trans_max(sbp) \ +              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max)) +#define set_sb_jp_journal_trans_max(sbp,v) \ +              ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v)) +#define sb_jp_journal_magic(sbp) \ +              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic)) +#define set_sb_jp_journal_magic(sbp,v) \ +              ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v)) +#define sb_jp_journal_max_batch(sbp) \ +              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch)) +#define set_sb_jp_journal_max_batch(sbp,v) \ +              ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v)) +#define sb_jp_jourmal_max_commit_age(sbp) \ +              (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age)) +#define 
set_sb_jp_journal_max_commit_age(sbp,v) \ +              ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v)) + +#define sb_blocksize(sbp)          (le16_to_cpu((sbp)->s_v1.s_blocksize)) +#define set_sb_blocksize(sbp,v)    ((sbp)->s_v1.s_blocksize = cpu_to_le16(v)) +#define sb_oid_maxsize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_maxsize)) +#define set_sb_oid_maxsize(sbp,v)  ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v)) +#define sb_oid_cursize(sbp)        (le16_to_cpu((sbp)->s_v1.s_oid_cursize)) +#define set_sb_oid_cursize(sbp,v)  ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v)) +#define sb_umount_state(sbp)       (le16_to_cpu((sbp)->s_v1.s_umount_state)) +#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v)) +#define sb_fs_state(sbp)           (le16_to_cpu((sbp)->s_v1.s_fs_state)) +#define set_sb_fs_state(sbp,v)     ((sbp)->s_v1.s_fs_state = cpu_to_le16(v)) +#define sb_hash_function_code(sbp) \ +              (le32_to_cpu((sbp)->s_v1.s_hash_function_code)) +#define set_sb_hash_function_code(sbp,v) \ +              ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v)) +#define sb_tree_height(sbp)        (le16_to_cpu((sbp)->s_v1.s_tree_height)) +#define set_sb_tree_height(sbp,v)  ((sbp)->s_v1.s_tree_height = cpu_to_le16(v)) +#define sb_bmap_nr(sbp)            (le16_to_cpu((sbp)->s_v1.s_bmap_nr)) +#define set_sb_bmap_nr(sbp,v)      ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v)) +#define sb_version(sbp)            (le16_to_cpu((sbp)->s_v1.s_version)) +#define set_sb_version(sbp,v)      ((sbp)->s_v1.s_version = cpu_to_le16(v)) + +#define sb_mnt_count(sbp)	   (le16_to_cpu((sbp)->s_mnt_count)) +#define set_sb_mnt_count(sbp, v)   ((sbp)->s_mnt_count = cpu_to_le16(v)) + +#define sb_reserved_for_journal(sbp) \ +              (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal)) +#define set_sb_reserved_for_journal(sbp,v) \ +              ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v)) + +/* LOGGING -- */ + +/* These all interelate for 
performance. +** +** If the journal block count is smaller than n transactions, you lose speed. +** I don't know what n is yet, I'm guessing 8-16. +** +** typical transaction size depends on the application, how often fsync is +** called, and how many metadata blocks you dirty in a 30 second period. +** The more small files (<16k) you use, the larger your transactions will +** be. +** +** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal +** to wrap, which slows things down.  If you need high speed meta data updates, the journal should be big enough +** to prevent wrapping before dirty meta blocks get to disk. +** +** If the batch max is smaller than the transaction max, you'll waste space at the end of the journal +** because journal_end sets the next transaction to start at 0 if the next transaction has any chance of wrapping. +** +** The large the batch max age, the better the speed, and the more meta data changes you'll lose after a crash. +** +*/ + +/* don't mess with these for a while */ +				/* we have a node size define somewhere in reiserfs_fs.h. -Hans */ +#define JOURNAL_BLOCK_SIZE  4096	/* BUG gotta get rid of this */ +#define JOURNAL_MAX_CNODE   1500	/* max cnodes to allocate. */ +#define JOURNAL_HASH_SIZE 8192 +#define JOURNAL_NUM_BITMAPS 5	/* number of copies of the bitmaps to have floating.  Must be >= 2 */ + +/* One of these for every block in every transaction +** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a +** hash of all the in memory transactions. +** next and prev are used by the current transaction (journal_hash). +** hnext and hprev are used by journal_list_hash.  If a block is in more than one transaction, the journal_list_hash +** links it in multiple times.  This allows flush_journal_list to remove just the cnode belonging +** to a given transaction. 
+*/ +struct reiserfs_journal_cnode { +	struct buffer_head *bh;	/* real buffer head */ +	struct super_block *sb;	/* dev of real buffer head */ +	__u32 blocknr;		/* block number of real buffer head, == 0 when buffer on disk */ +	unsigned long state; +	struct reiserfs_journal_list *jlist;	/* journal list this cnode lives in */ +	struct reiserfs_journal_cnode *next;	/* next in transaction list */ +	struct reiserfs_journal_cnode *prev;	/* prev in transaction list */ +	struct reiserfs_journal_cnode *hprev;	/* prev in hash list */ +	struct reiserfs_journal_cnode *hnext;	/* next in hash list */ +}; + +struct reiserfs_bitmap_node { +	int id; +	char *data; +	struct list_head list; +}; + +struct reiserfs_list_bitmap { +	struct reiserfs_journal_list *journal_list; +	struct reiserfs_bitmap_node **bitmaps; +}; + +/* +** one of these for each transaction.  The most important part here is the j_realblock. +** this list of cnodes is used to hash all the blocks in all the commits, to mark all the +** real buffer heads dirty once all the commits hit the disk, +** and to make sure every real block in a transaction is on disk before allowing the log area +** to be overwritten */ +struct reiserfs_journal_list { +	unsigned long j_start; +	unsigned long j_state; +	unsigned long j_len; +	atomic_t j_nonzerolen; +	atomic_t j_commit_left; +	atomic_t j_older_commits_done;	/* all commits older than this on disk */ +	struct mutex j_commit_mutex; +	unsigned int j_trans_id; +	time_t j_timestamp; +	struct reiserfs_list_bitmap *j_list_bitmap; +	struct buffer_head *j_commit_bh;	/* commit buffer head */ +	struct reiserfs_journal_cnode *j_realblock; +	struct reiserfs_journal_cnode *j_freedlist;	/* list of buffers that were freed during this trans.  
free each of these on flush */ +	/* time ordered list of all active transactions */ +	struct list_head j_list; + +	/* time ordered list of all transactions we haven't tried to flush yet */ +	struct list_head j_working_list; + +	/* list of tail conversion targets in need of flush before commit */ +	struct list_head j_tail_bh_list; +	/* list of data=ordered buffers in need of flush before commit */ +	struct list_head j_bh_list; +	int j_refcount; +}; + +struct reiserfs_journal { +	struct buffer_head **j_ap_blocks;	/* journal blocks on disk */ +	struct reiserfs_journal_cnode *j_last;	/* newest journal block */ +	struct reiserfs_journal_cnode *j_first;	/*  oldest journal block.  start here for traverse */ + +	struct block_device *j_dev_bd; +	fmode_t j_dev_mode; +	int j_1st_reserved_block;	/* first block on s_dev of reserved area journal */ + +	unsigned long j_state; +	unsigned int j_trans_id; +	unsigned long j_mount_id; +	unsigned long j_start;	/* start of current waiting commit (index into j_ap_blocks) */ +	unsigned long j_len;	/* length of current waiting commit */ +	unsigned long j_len_alloc;	/* number of buffers requested by journal_begin() */ +	atomic_t j_wcount;	/* count of writers for current commit */ +	unsigned long j_bcount;	/* batch count. allows turning X transactions into 1 */ +	unsigned long j_first_unflushed_offset;	/* first unflushed transactions offset */ +	unsigned j_last_flush_trans_id;	/* last fully flushed journal timestamp */ +	struct buffer_head *j_header_bh; + +	time_t j_trans_start_time;	/* time this transaction started */ +	struct mutex j_mutex; +	struct mutex j_flush_mutex; +	wait_queue_head_t j_join_wait;	/* wait for current transaction to finish before starting new one */ +	atomic_t j_jlock;	/* lock for j_join_wait */ +	int j_list_bitmap_index;	/* number of next list bitmap to use */ +	int j_must_wait;	/* no more journal begins allowed. 
MUST sleep on j_join_wait */ +	int j_next_full_flush;	/* next journal_end will flush all journal list */ +	int j_next_async_flush;	/* next journal_end will flush all async commits */ + +	int j_cnode_used;	/* number of cnodes on the used list */ +	int j_cnode_free;	/* number of cnodes on the free list */ + +	unsigned int j_trans_max;	/* max number of blocks in a transaction.  */ +	unsigned int j_max_batch;	/* max number of blocks to batch into a trans */ +	unsigned int j_max_commit_age;	/* in seconds, how old can an async commit be */ +	unsigned int j_max_trans_age;	/* in seconds, how old can a transaction be */ +	unsigned int j_default_max_commit_age;	/* the default for the max commit age */ + +	struct reiserfs_journal_cnode *j_cnode_free_list; +	struct reiserfs_journal_cnode *j_cnode_free_orig;	/* orig pointer returned from vmalloc */ + +	struct reiserfs_journal_list *j_current_jl; +	int j_free_bitmap_nodes; +	int j_used_bitmap_nodes; + +	int j_num_lists;	/* total number of active transactions */ +	int j_num_work_lists;	/* number that need attention from kreiserfsd */ + +	/* debugging to make sure things are flushed in order */ +	unsigned int j_last_flush_id; + +	/* debugging to make sure things are committed in order */ +	unsigned int j_last_commit_id; + +	struct list_head j_bitmap_nodes; +	struct list_head j_dirty_buffers; +	spinlock_t j_dirty_buffers_lock;	/* protects j_dirty_buffers */ + +	/* list of all active transactions */ +	struct list_head j_journal_list; +	/* lists that haven't been touched by writeback attempts */ +	struct list_head j_working_list; + +	struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];	/* array of bitmaps to record the deleted blocks */ +	struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];	/* hash table for real buffer heads in current trans */ +	struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];	/* hash table for all the real buffer heads in all +										   the transactions */ +	struct 
list_head j_prealloc_list;	/* list of inodes which have preallocated blocks */ +	int j_persistent_trans; +	unsigned long j_max_trans_size; +	unsigned long j_max_batch_size; + +	int j_errno; + +	/* when flushing ordered buffers, throttle new ordered writers */ +	struct delayed_work j_work; +	struct super_block *j_work_sb; +	atomic_t j_async_throttle; +}; + +enum journal_state_bits { +	J_WRITERS_BLOCKED = 1,	/* set when new writers not allowed */ +	J_WRITERS_QUEUED,	/* set when log is full due to too many writers */ +	J_ABORTED,		/* set when log is aborted */ +}; + +#define JOURNAL_DESC_MAGIC "ReIsErLB"	/* ick.  magic string to find desc blocks in the journal */ + +typedef __u32(*hashf_t) (const signed char *, int); + +struct reiserfs_bitmap_info { +	__u32 free_count; +}; + +struct proc_dir_entry; + +#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO ) +typedef unsigned long int stat_cnt_t; +typedef struct reiserfs_proc_info_data { +	spinlock_t lock; +	int exiting; +	int max_hash_collisions; + +	stat_cnt_t breads; +	stat_cnt_t bread_miss; +	stat_cnt_t search_by_key; +	stat_cnt_t search_by_key_fs_changed; +	stat_cnt_t search_by_key_restarted; + +	stat_cnt_t insert_item_restarted; +	stat_cnt_t paste_into_item_restarted; +	stat_cnt_t cut_from_item_restarted; +	stat_cnt_t delete_solid_item_restarted; +	stat_cnt_t delete_item_restarted; + +	stat_cnt_t leaked_oid; +	stat_cnt_t leaves_removable; + +	/* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. 
*/ +	stat_cnt_t balance_at[5];	/* XXX */ +	/* sbk == search_by_key */ +	stat_cnt_t sbk_read_at[5];	/* XXX */ +	stat_cnt_t sbk_fs_changed[5]; +	stat_cnt_t sbk_restarted[5]; +	stat_cnt_t items_at[5];	/* XXX */ +	stat_cnt_t free_at[5];	/* XXX */ +	stat_cnt_t can_node_be_removed[5];	/* XXX */ +	long int lnum[5];	/* XXX */ +	long int rnum[5];	/* XXX */ +	long int lbytes[5];	/* XXX */ +	long int rbytes[5];	/* XXX */ +	stat_cnt_t get_neighbors[5]; +	stat_cnt_t get_neighbors_restart[5]; +	stat_cnt_t need_l_neighbor[5]; +	stat_cnt_t need_r_neighbor[5]; + +	stat_cnt_t free_block; +	struct __scan_bitmap_stats { +		stat_cnt_t call; +		stat_cnt_t wait; +		stat_cnt_t bmap; +		stat_cnt_t retry; +		stat_cnt_t in_journal_hint; +		stat_cnt_t in_journal_nohint; +		stat_cnt_t stolen; +	} scan_bitmap; +	struct __journal_stats { +		stat_cnt_t in_journal; +		stat_cnt_t in_journal_bitmap; +		stat_cnt_t in_journal_reusable; +		stat_cnt_t lock_journal; +		stat_cnt_t lock_journal_wait; +		stat_cnt_t journal_being; +		stat_cnt_t journal_relock_writers; +		stat_cnt_t journal_relock_wcount; +		stat_cnt_t mark_dirty; +		stat_cnt_t mark_dirty_already; +		stat_cnt_t mark_dirty_notjournal; +		stat_cnt_t restore_prepared; +		stat_cnt_t prepare; +		stat_cnt_t prepare_retry; +	} journal; +} reiserfs_proc_info_data_t; +#else +typedef struct reiserfs_proc_info_data { +} reiserfs_proc_info_data_t; +#endif + +/* reiserfs union of in-core super block data */ +struct reiserfs_sb_info { +	struct buffer_head *s_sbh;	/* Buffer containing the super block */ +	/* both the comment and the choice of +	   name are unclear for s_rs -Hans */ +	struct reiserfs_super_block *s_rs;	/* Pointer to the super block in the buffer */ +	struct reiserfs_bitmap_info *s_ap_bitmap; +	struct reiserfs_journal *s_journal;	/* pointer to journal information */ +	unsigned short s_mount_state;	/* reiserfs state (valid, invalid) */ + +	/* Serialize writers access, replace the old bkl */ +	struct mutex lock; +	/* Owner of the lock (can be 
recursive) */ +	struct task_struct *lock_owner; +	/* Depth of the lock, start from -1 like the bkl */ +	int lock_depth; + +	/* Comment? -Hans */ +	void (*end_io_handler) (struct buffer_head *, int); +	hashf_t s_hash_function;	/* pointer to function which is used +					   to sort names in directory. Set on +					   mount */ +	unsigned long s_mount_opt;	/* reiserfs's mount options are set +					   here (currently - NOTAIL, NOLOG, +					   REPLAYONLY) */ + +	struct {		/* This is a structure that describes block allocator options */ +		unsigned long bits;	/* Bitfield for enable/disable kind of options */ +		unsigned long large_file_size;	/* size started from which we consider file to be a large one(in blocks) */ +		int border;	/* percentage of disk, border takes */ +		int preallocmin;	/* Minimal file size (in blocks) starting from which we do preallocations */ +		int preallocsize;	/* Number of blocks we try to prealloc when file +					   reaches preallocmin size (in blocks) or +					   prealloc_list is empty. */ +	} s_alloc_options; + +	/* Comment? -Hans */ +	wait_queue_head_t s_wait; +	/* To be obsoleted soon by per buffer seals.. -Hans */ +	atomic_t s_generation_counter;	// increased by one every time the +	// tree gets re-balanced +	unsigned long s_properties;	/* File system properties. Currently holds +					   on-disk FS format */ + +	/* session statistics */ +	int s_disk_reads; +	int s_disk_writes; +	int s_fix_nodes; +	int s_do_balance; +	int s_unneeded_left_neighbor; +	int s_good_search_by_key_reada; +	int s_bmaps; +	int s_bmaps_without_search; +	int s_direct2indirect; +	int s_indirect2direct; +	/* set up when it's ok for reiserfs_read_inode2() to read from +	   disk inode with nlink==0. 
Currently this is only used during +	   finish_unfinished() processing at mount time */ +	int s_is_unlinked_ok; +	reiserfs_proc_info_data_t s_proc_info_data; +	struct proc_dir_entry *procdir; +	int reserved_blocks;	/* amount of blocks reserved for further allocations */ +	spinlock_t bitmap_lock;	/* this lock on now only used to protect reserved_blocks variable */ +	struct dentry *priv_root;	/* root of /.reiserfs_priv */ +	struct dentry *xattr_root;	/* root of /.reiserfs_priv/xattrs */ +	int j_errno; +#ifdef CONFIG_QUOTA +	char *s_qf_names[MAXQUOTAS]; +	int s_jquota_fmt; +#endif +	char *s_jdev;		/* Stored jdev for mount option showing */ +#ifdef CONFIG_REISERFS_CHECK + +	struct tree_balance *cur_tb;	/* +					 * Detects whether more than one +					 * copy of tb exists per superblock +					 * as a means of checking whether +					 * do_balance is executing concurrently +					 * against another tree reader/writer +					 * on a same mount point. +					 */ +#endif +}; + +/* Definitions of reiserfs on-disk properties: */ +#define REISERFS_3_5 0 +#define REISERFS_3_6 1 +#define REISERFS_OLD_FORMAT 2 + +enum reiserfs_mount_options { +/* Mount options */ +	REISERFS_LARGETAIL,	/* large tails will be created in a session */ +	REISERFS_SMALLTAIL,	/* small (for files less than block size) tails will be created in a session */ +	REPLAYONLY,		/* replay journal and return 0. Use by fsck */ +	REISERFS_CONVERT,	/* -o conv: causes conversion of old +				   format super block to the new +				   format. If not specified - old +				   partition will be dealt with in a +				   manner of 3.5.x */ + +/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting +** reiserfs disks from 3.5.19 or earlier.  99% of the time, this option +** is not required.  If the normal autodection code can't determine which +** hash to use (because both hashes had the same value for a file) +** use this option to force a specific hash.  
It won't allow you to override +** the existing hash on the FS, so if you have a tea hash disk, and mount +** with -o hash=rupasov, the mount will fail. +*/ +	FORCE_TEA_HASH,		/* try to force tea hash on mount */ +	FORCE_RUPASOV_HASH,	/* try to force rupasov hash on mount */ +	FORCE_R5_HASH,		/* try to force rupasov hash on mount */ +	FORCE_HASH_DETECT,	/* try to detect hash function on mount */ + +	REISERFS_DATA_LOG, +	REISERFS_DATA_ORDERED, +	REISERFS_DATA_WRITEBACK, + +/* used for testing experimental features, makes benchmarking new +   features with and without more convenient, should never be used by +   users in any code shipped to users (ideally) */ + +	REISERFS_NO_BORDER, +	REISERFS_NO_UNHASHED_RELOCATION, +	REISERFS_HASHED_RELOCATION, +	REISERFS_ATTRS, +	REISERFS_XATTRS_USER, +	REISERFS_POSIXACL, +	REISERFS_EXPOSE_PRIVROOT, +	REISERFS_BARRIER_NONE, +	REISERFS_BARRIER_FLUSH, + +	/* Actions on error */ +	REISERFS_ERROR_PANIC, +	REISERFS_ERROR_RO, +	REISERFS_ERROR_CONTINUE, + +	REISERFS_USRQUOTA,	/* User quota option specified */ +	REISERFS_GRPQUOTA,	/* Group quota option specified */ + +	REISERFS_TEST1, +	REISERFS_TEST2, +	REISERFS_TEST3, +	REISERFS_TEST4, +	REISERFS_UNSUPPORTED_OPT, +}; + +#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH)) +#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH)) +#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH)) +#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT)) +#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER)) +#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION)) +#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION)) +#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4)) + +#define have_large_tails(s) 
(REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL)) +#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL)) +#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY)) +#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS)) +#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5)) +#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT)) +#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG)) +#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED)) +#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK)) +#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER)) +#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL)) +#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT)) +#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s)) +#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE)) +#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH)) + +#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC)) +#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO)) + +void reiserfs_file_buffer(struct buffer_head *bh, int list); +extern struct file_system_type reiserfs_fs_type; +int reiserfs_resize(struct super_block *, unsigned long); + +#define CARRY_ON                0 +#define SCHEDULE_OCCURRED       1 + +#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh) +#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal) +#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block) +#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) +#define 
SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap) + +#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) + +/* A safe version of the "bdevname", which returns the "s_id" field of + * a superblock or else "Null superblock" if the super block is NULL. + */ +static inline char *reiserfs_bdevname(struct super_block *s) +{ +	return (s == NULL) ? "Null superblock" : s->s_id; +} + +#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) +static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal +						*journal) +{ +	return test_bit(J_ABORTED, &journal->j_state); +} + +/* + * Locking primitives. The write lock is a per superblock + * special mutex that has properties close to the Big Kernel Lock + * which was used in the previous locking scheme. + */ +void reiserfs_write_lock(struct super_block *s); +void reiserfs_write_unlock(struct super_block *s); +int reiserfs_write_lock_once(struct super_block *s); +void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *s); +#else +static inline void reiserfs_lock_check_recursive(struct super_block *s) { } +#endif + +/* + * Several mutexes depend on the write lock. + * However sometimes we want to relax the write lock while we hold + * these mutexes, according to the release/reacquire on schedule() + * properties of the Bkl that were used. + * Reiserfs performances and locking were based on this scheme. 
+ * Now that the write lock is a mutex and not the bkl anymore, doing so + * may result in a deadlock: + * + * A acquire write_lock + * A acquire j_commit_mutex + * A release write_lock and wait for something + * B acquire write_lock + * B can't acquire j_commit_mutex and sleep + * A can't acquire write lock anymore + * deadlock + * + * What we do here is avoiding such deadlock by playing the same game + * than the Bkl: if we can't acquire a mutex that depends on the write lock, + * we release the write lock, wait a bit and then retry. + * + * The mutexes concerned by this hack are: + * - The commit mutex of a journal list + * - The flush mutex + * - The journal lock + * - The inode mutex + */ +static inline void reiserfs_mutex_lock_safe(struct mutex *m, +			       struct super_block *s) +{ +	reiserfs_lock_check_recursive(s); +	reiserfs_write_unlock(s); +	mutex_lock(m); +	reiserfs_write_lock(s); +} + +static inline void +reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass, +			       struct super_block *s) +{ +	reiserfs_lock_check_recursive(s); +	reiserfs_write_unlock(s); +	mutex_lock_nested(m, subclass); +	reiserfs_write_lock(s); +} + +static inline void +reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s) +{ +	reiserfs_lock_check_recursive(s); +	reiserfs_write_unlock(s); +	down_read(sem); +	reiserfs_write_lock(s); +} + +/* + * When we schedule, we usually want to also release the write lock, + * according to the previous bkl based locking scheme of reiserfs. 
+ */ +static inline void reiserfs_cond_resched(struct super_block *s) +{ +	if (need_resched()) { +		reiserfs_write_unlock(s); +		schedule(); +		reiserfs_write_lock(s); +	} +} + +struct fid; + +/* in reading the #defines, it may help to understand that they employ +   the following abbreviations: + +   B = Buffer +   I = Item header +   H = Height within the tree (should be changed to LEV) +   N = Number of the item in the node +   STAT = stat data +   DEH = Directory Entry Header +   EC = Entry Count +   E = Entry number +   UL = Unsigned Long +   BLKH = BLocK Header +   UNFM = UNForMatted node +   DC = Disk Child +   P = Path + +   These #defines are named by concatenating these abbreviations, +   where first comes the arguments, and last comes the return value, +   of the macro. + +*/ + +#define USE_INODE_GENERATION_COUNTER + +#define REISERFS_PREALLOCATE +#define DISPLACE_NEW_PACKING_LOCALITIES +#define PREALLOCATION_SIZE 9 + +/* n must be power of 2 */ +#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u)) + +// to be ok for alpha and others we have to align structures to 8 byte +// boundary. +// FIXME: do not change 4 by anything else: there is code which relies on that +#define ROUND_UP(x) _ROUND_UP(x,8LL) + +/* debug levels.  Right now, CONFIG_REISERFS_CHECK means print all debug +** messages. +*/ +#define REISERFS_DEBUG_CODE 5	/* extra messages to help find/debug errors */ + +void __reiserfs_warning(struct super_block *s, const char *id, +			 const char *func, const char *fmt, ...); +#define reiserfs_warning(s, id, fmt, args...) \ +	 __reiserfs_warning(s, id, __func__, fmt, ##args) +/* assertions handling */ + +/** always check a condition and panic if it's false. */ +#define __RASSERT(cond, scond, format, args...)			\ +do {									\ +	if (!(cond))							\ +		reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \ +			       __FILE__ ":%i:%s: " format "\n",		\ +			       in_interrupt() ? 
-1 : task_pid_nr(current), \ +			       __LINE__, __func__ , ##args);		\ +} while (0) + +#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args) + +#if defined( CONFIG_REISERFS_CHECK ) +#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args) +#else +#define RFALSE( cond, format, args... ) do {;} while( 0 ) +#endif + +#define CONSTF __attribute_const__ +/* + * Disk Data Structures + */ + +/***************************************************************************/ +/*                             SUPER BLOCK                                 */ +/***************************************************************************/ + +/* + * Structure of super block on disk, a version of which in RAM is often accessed as REISERFS_SB(s)->s_rs + * the version in RAM is part of a larger structure containing fields never written to disk. + */ +#define UNSET_HASH 0		// read_super will guess about, what hash names +		     // in directories were sorted with +#define TEA_HASH  1 +#define YURA_HASH 2 +#define R5_HASH   3 +#define DEFAULT_HASH R5_HASH + +struct journal_params { +	__le32 jp_journal_1st_block;	/* where does journal start from on its +					 * device */ +	__le32 jp_journal_dev;	/* journal device st_rdev */ +	__le32 jp_journal_size;	/* size of the journal */ +	__le32 jp_journal_trans_max;	/* max number of blocks in a transaction. 
*/ +	__le32 jp_journal_magic;	/* random value made on fs creation (this +					 * was sb_journal_block_count) */ +	__le32 jp_journal_max_batch;	/* max number of blocks to batch into a +					 * trans */ +	__le32 jp_journal_max_commit_age;	/* in seconds, how old can an async +						 * commit be */ +	__le32 jp_journal_max_trans_age;	/* in seconds, how old can a transaction +						 * be */ +}; + +/* this is the super from 3.5.X, where X >= 10 */ +struct reiserfs_super_block_v1 { +	__le32 s_block_count;	/* blocks count         */ +	__le32 s_free_blocks;	/* free blocks count    */ +	__le32 s_root_block;	/* root block number    */ +	struct journal_params s_journal; +	__le16 s_blocksize;	/* block size */ +	__le16 s_oid_maxsize;	/* max size of object id array, see +				 * get_objectid() commentary  */ +	__le16 s_oid_cursize;	/* current size of object id array */ +	__le16 s_umount_state;	/* this is set to 1 when filesystem was +				 * umounted, to 2 - when not */ +	char s_magic[10];	/* reiserfs magic string indicates that +				 * file system is reiserfs: +				 * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */ +	__le16 s_fs_state;	/* it is set to used by fsck to mark which +				 * phase of rebuilding is done */ +	__le32 s_hash_function_code;	/* indicate, what hash function is being use +					 * to sort names in a directory*/ +	__le16 s_tree_height;	/* height of disk tree */ +	__le16 s_bmap_nr;	/* amount of bitmap blocks needed to address +				 * each block of file system */ +	__le16 s_version;	/* this field is only reliable on filesystem +				 * with non-standard journal */ +	__le16 s_reserved_for_journal;	/* size in blocks of journal area on main +					 * device, we need to keep after +					 * making fs with non-standard journal */ +} __attribute__ ((__packed__)); + +#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1)) + +/* this is the on disk super block */ +struct reiserfs_super_block { +	struct reiserfs_super_block_v1 s_v1; +	__le32 s_inode_generation; +	__le32 
s_flags;		/* Right now used only by inode-attributes, if enabled */ +	unsigned char s_uuid[16];	/* filesystem unique identifier */ +	unsigned char s_label[16];	/* filesystem volume label */ +	__le16 s_mnt_count;		/* Count of mounts since last fsck */ +	__le16 s_max_mnt_count;		/* Maximum mounts before check */ +	__le32 s_lastcheck;		/* Timestamp of last fsck */ +	__le32 s_check_interval;	/* Interval between checks */ +	char s_unused[76];	/* zero filled by mkreiserfs and +				 * reiserfs_convert_objectid_map_v1() +				 * so any additions must be updated +				 * there as well. */ +} __attribute__ ((__packed__)); + +#define SB_SIZE (sizeof(struct reiserfs_super_block)) + +#define REISERFS_VERSION_1 0 +#define REISERFS_VERSION_2 2 + +// on-disk super block fields converted to cpu form +#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs) +#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1)) +#define SB_BLOCKSIZE(s) \ +        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize)) +#define SB_BLOCK_COUNT(s) \ +        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count)) +#define SB_FREE_BLOCKS(s) \ +        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks)) +#define SB_REISERFS_MAGIC(s) \ +        (SB_V1_DISK_SUPER_BLOCK(s)->s_magic) +#define SB_ROOT_BLOCK(s) \ +        le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block)) +#define SB_TREE_HEIGHT(s) \ +        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height)) +#define SB_REISERFS_STATE(s) \ +        le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state)) +#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version)) +#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr)) + +#define PUT_SB_BLOCK_COUNT(s, val) \ +   do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0) +#define PUT_SB_FREE_BLOCKS(s, val) \ +   do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0) +#define PUT_SB_ROOT_BLOCK(s, val) \ +   do 
{ SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0) +#define PUT_SB_TREE_HEIGHT(s, val) \ +   do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0) +#define PUT_SB_REISERFS_STATE(s, val) \ +   do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0) +#define PUT_SB_VERSION(s, val) \ +   do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0) +#define PUT_SB_BMAP_NR(s, val) \ +   do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0) + +#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal) +#define SB_ONDISK_JOURNAL_SIZE(s) \ +         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size)) +#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \ +         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block)) +#define SB_ONDISK_JOURNAL_DEVICE(s) \ +         le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev)) +#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \ +         le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal)) + +#define is_block_in_log_or_reserved_area(s, block) \ +         block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \ +         && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) +  \ +         ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \ +         SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s))) + +int is_reiserfs_3_5(struct reiserfs_super_block *rs); +int is_reiserfs_3_6(struct reiserfs_super_block *rs); +int is_reiserfs_jr(struct reiserfs_super_block *rs); + +/* ReiserFS leaves the first 64k unused, so that partition labels have +   enough space.  If someone wants to write a fancy bootloader that +   needs more than 64k, let us know, and this will be increased in size. +   This number must be larger than than the largest block size on any +   platform, or code will break.  
-Hans */ +#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024) +#define REISERFS_FIRST_BLOCK unused_define +#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES + +/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */ +#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024) + +/* reiserfs internal error code (used by search_by_key and fix_nodes)) */ +#define CARRY_ON      0 +#define REPEAT_SEARCH -1 +#define IO_ERROR      -2 +#define NO_DISK_SPACE -3 +#define NO_BALANCING_NEEDED  (-4) +#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5) +#define QUOTA_EXCEEDED -6 + +typedef __u32 b_blocknr_t; +typedef __le32 unp_t; + +struct unfm_nodeinfo { +	unp_t unfm_nodenum; +	unsigned short unfm_freespace; +}; + +/* there are two formats of keys: 3.5 and 3.6 + */ +#define KEY_FORMAT_3_5 0 +#define KEY_FORMAT_3_6 1 + +/* there are two stat datas */ +#define STAT_DATA_V1 0 +#define STAT_DATA_V2 1 + +static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode) +{ +	return container_of(inode, struct reiserfs_inode_info, vfs_inode); +} + +static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb) +{ +	return sb->s_fs_info; +} + +/* Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 + * which overflows on large file systems. */ +static inline __u32 reiserfs_bmap_count(struct super_block *sb) +{ +	return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1; +} + +static inline int bmap_would_wrap(unsigned bmap_nr) +{ +	return bmap_nr > ((1LL << 16) - 1); +} + +/** this says about version of key of all items (but stat data) the +    object consists of */ +#define get_inode_item_key_version( inode )                                    \ +    ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? 
KEY_FORMAT_3_6 : KEY_FORMAT_3_5) + +#define set_inode_item_key_version( inode, version )                           \ +         ({ if((version)==KEY_FORMAT_3_6)                                      \ +                REISERFS_I(inode)->i_flags |= i_item_key_version_mask;      \ +            else                                                               \ +                REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; }) + +#define get_inode_sd_version(inode)                                            \ +    ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1) + +#define set_inode_sd_version(inode, version)                                   \ +         ({ if((version)==STAT_DATA_V2)                                        \ +                REISERFS_I(inode)->i_flags |= i_stat_data_version_mask;     \ +            else                                                               \ +                REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; }) + +/* This is an aggressive tail suppression policy, I am hoping it +   improves our benchmarks. The principle behind it is that percentage +   space saving is what matters, not absolute space saving.  This is +   non-intuitive, but it helps to understand it if you consider that the +   cost to access 4 blocks is not much more than the cost to access 1 +   block, if you have to do a seek and rotate.  A tail risks a +   non-linear disk access that is significant as a percentage of total +   time cost for a 4 block file and saves an amount of space that is +   less significant as a percentage of space, or so goes the hypothesis. 
+   -Hans */ +#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \ +(\ +  (!(n_tail_size)) || \ +  (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \ +   ( (n_file_size) >= (n_block_size) * 4 ) || \ +   ( ( (n_file_size) >= (n_block_size) * 3 ) && \ +     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \ +   ( ( (n_file_size) >= (n_block_size) * 2 ) && \ +     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \ +   ( ( (n_file_size) >= (n_block_size) ) && \ +     ( (n_tail_size) >=   (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \ +) + +/* Another strategy for tails, this one means only create a tail if all the +   file would fit into one DIRECT item. +   Primary intention for this one is to increase performance by decreasing +   seeking. +*/ +#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \ +(\ +  (!(n_tail_size)) || \ +  (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \ +) + +/* + * values for s_umount_state field + */ +#define REISERFS_VALID_FS    1 +#define REISERFS_ERROR_FS    2 + +// +// there are 5 item types currently +// +#define TYPE_STAT_DATA 0 +#define TYPE_INDIRECT 1 +#define TYPE_DIRECT 2 +#define TYPE_DIRENTRY 3 +#define TYPE_MAXTYPE 3 +#define TYPE_ANY 15		// FIXME: comment is required + +/***************************************************************************/ +/*                       KEY & ITEM HEAD                                   */ +/***************************************************************************/ + +// +// directories use this key as well as old files +// +struct offset_v1 { +	__le32 k_offset; +	__le32 k_uniqueness; +} __attribute__ ((__packed__)); + +struct offset_v2 { +	__le64 v; +} __attribute__ ((__packed__)); + +static inline __u16 offset_v2_k_type(const struct offset_v2 *v2) +{ +	__u8 type = le64_to_cpu(v2->v) >> 60; +	return (type <= TYPE_MAXTYPE) ? 
type : TYPE_ANY; +} + +static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type) +{ +	v2->v = +	    (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60); +} + +static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2) +{ +	return le64_to_cpu(v2->v) & (~0ULL >> 4); +} + +static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset) +{ +	offset &= (~0ULL >> 4); +	v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset); +} + +/* Key of an item determines its location in the S+tree, and +   is composed of 4 components */ +struct reiserfs_key { +	__le32 k_dir_id;	/* packing locality: by default parent +				   directory object id */ +	__le32 k_objectid;	/* object identifier */ +	union { +		struct offset_v1 k_offset_v1; +		struct offset_v2 k_offset_v2; +	} __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); + +struct in_core_key { +	__u32 k_dir_id;		/* packing locality: by default parent +				   directory object id */ +	__u32 k_objectid;	/* object identifier */ +	__u64 k_offset; +	__u8 k_type; +}; + +struct cpu_key { +	struct in_core_key on_disk_key; +	int version; +	int key_length;		/* 3 in all cases but direct2indirect and +				   indirect2direct conversion */ +}; + +/* Our function for comparing keys can compare keys of different +   lengths.  It takes as a parameter the length of the keys it is to +   compare.  These defines are used in determining what is to be passed +   to it as that parameter. 
*/ +#define REISERFS_FULL_KEY_LEN     4 +#define REISERFS_SHORT_KEY_LEN    2 + +/* The result of the key compare */ +#define FIRST_GREATER 1 +#define SECOND_GREATER -1 +#define KEYS_IDENTICAL 0 +#define KEY_FOUND 1 +#define KEY_NOT_FOUND 0 + +#define KEY_SIZE (sizeof(struct reiserfs_key)) +#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32)) + +/* return values for search_by_key and clones */ +#define ITEM_FOUND 1 +#define ITEM_NOT_FOUND 0 +#define ENTRY_FOUND 1 +#define ENTRY_NOT_FOUND 0 +#define DIRECTORY_NOT_FOUND -1 +#define REGULAR_FILE_FOUND -2 +#define DIRECTORY_FOUND -3 +#define BYTE_FOUND 1 +#define BYTE_NOT_FOUND 0 +#define FILE_NOT_FOUND -1 + +#define POSITION_FOUND 1 +#define POSITION_NOT_FOUND 0 + +// return values for reiserfs_find_entry and search_by_entry_key +#define NAME_FOUND 1 +#define NAME_NOT_FOUND 0 +#define GOTO_PREVIOUS_ITEM 2 +#define NAME_FOUND_INVISIBLE 3 + +/*  Everything in the filesystem is stored as a set of items.  The +    item head contains the key of the item, its free space (for +    indirect items) and specifies the location of the item itself +    within the block.  */ + +struct item_head { +	/* Everything in the tree is found by searching for it based on +	 * its key.*/ +	struct reiserfs_key ih_key; +	union { +		/* The free space in the last unformatted node of an +		   indirect item if this is an indirect item.  This +		   equals 0xFFFF iff this is a direct item or stat data +		   item. Note that the key, not this field, is used to +		   determine the item type, and thus which field this +		   union contains. */ +		__le16 ih_free_space_reserved; +		/* Iff this is a directory item, this field equals the +		   number of directory entries in the directory item. 
*/ +		__le16 ih_entry_count; +	} __attribute__ ((__packed__)) u; +	__le16 ih_item_len;	/* total size of the item body */ +	__le16 ih_item_location;	/* an offset to the item body +					 * within the block */ +	__le16 ih_version;	/* 0 for all old items, 2 for new +				   ones. Highest bit is set by fsck +				   temporary, cleaned after all +				   done */ +} __attribute__ ((__packed__)); +/* size of item header     */ +#define IH_SIZE (sizeof(struct item_head)) + +#define ih_free_space(ih)            le16_to_cpu((ih)->u.ih_free_space_reserved) +#define ih_version(ih)               le16_to_cpu((ih)->ih_version) +#define ih_entry_count(ih)           le16_to_cpu((ih)->u.ih_entry_count) +#define ih_location(ih)              le16_to_cpu((ih)->ih_item_location) +#define ih_item_len(ih)              le16_to_cpu((ih)->ih_item_len) + +#define put_ih_free_space(ih, val)   do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0) +#define put_ih_version(ih, val)      do { (ih)->ih_version = cpu_to_le16(val); } while (0) +#define put_ih_entry_count(ih, val)  do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0) +#define put_ih_location(ih, val)     do { (ih)->ih_item_location = cpu_to_le16(val); } while (0) +#define put_ih_item_len(ih, val)     do { (ih)->ih_item_len = cpu_to_le16(val); } while (0) + +#define unreachable_item(ih) (ih_version(ih) & (1 << 15)) + +#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih)) +#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val))) + +/* these operate on indirect items, where you've got an array of ints +** at a possibly unaligned location.  These are a noop on ia32 +**  +** p is the array of __u32, i is the index into the array, v is the value +** to store there. 
+*/ +#define get_block_num(p, i) get_unaligned_le32((p) + (i)) +#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i)) + +// +// in old version uniqueness field shows key type +// +#define V1_SD_UNIQUENESS 0 +#define V1_INDIRECT_UNIQUENESS 0xfffffffe +#define V1_DIRECT_UNIQUENESS 0xffffffff +#define V1_DIRENTRY_UNIQUENESS 500 +#define V1_ANY_UNIQUENESS 555	// FIXME: comment is required + +// +// here are conversion routines +// +static inline int uniqueness2type(__u32 uniqueness) CONSTF; +static inline int uniqueness2type(__u32 uniqueness) +{ +	switch ((int)uniqueness) { +	case V1_SD_UNIQUENESS: +		return TYPE_STAT_DATA; +	case V1_INDIRECT_UNIQUENESS: +		return TYPE_INDIRECT; +	case V1_DIRECT_UNIQUENESS: +		return TYPE_DIRECT; +	case V1_DIRENTRY_UNIQUENESS: +		return TYPE_DIRENTRY; +	case V1_ANY_UNIQUENESS: +	default: +		return TYPE_ANY; +	} +} + +static inline __u32 type2uniqueness(int type) CONSTF; +static inline __u32 type2uniqueness(int type) +{ +	switch (type) { +	case TYPE_STAT_DATA: +		return V1_SD_UNIQUENESS; +	case TYPE_INDIRECT: +		return V1_INDIRECT_UNIQUENESS; +	case TYPE_DIRECT: +		return V1_DIRECT_UNIQUENESS; +	case TYPE_DIRENTRY: +		return V1_DIRENTRY_UNIQUENESS; +	case TYPE_ANY: +	default: +		return V1_ANY_UNIQUENESS; +	} +} + +// +// key is pointer to on disk key which is stored in le, result is cpu, +// there is no way to get version of object from key, so, provide +// version to these defines +// +static inline loff_t le_key_k_offset(int version, +				     const struct reiserfs_key *key) +{ +	return (version == KEY_FORMAT_3_5) ? +	    le32_to_cpu(key->u.k_offset_v1.k_offset) : +	    offset_v2_k_offset(&(key->u.k_offset_v2)); +} + +static inline loff_t le_ih_k_offset(const struct item_head *ih) +{ +	return le_key_k_offset(ih_version(ih), &(ih->ih_key)); +} + +static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key) +{ +	return (version == KEY_FORMAT_3_5) ? 
+	    uniqueness2type(le32_to_cpu(key->u.k_offset_v1.k_uniqueness)) : +	    offset_v2_k_type(&(key->u.k_offset_v2)); +} + +static inline loff_t le_ih_k_type(const struct item_head *ih) +{ +	return le_key_k_type(ih_version(ih), &(ih->ih_key)); +} + +static inline void set_le_key_k_offset(int version, struct reiserfs_key *key, +				       loff_t offset) +{ +	(version == KEY_FORMAT_3_5) ? (void)(key->u.k_offset_v1.k_offset = cpu_to_le32(offset)) :	/* jdm check */ +	    (void)(set_offset_v2_k_offset(&(key->u.k_offset_v2), offset)); +} + +static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) +{ +	set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); +} + +static inline void set_le_key_k_type(int version, struct reiserfs_key *key, +				     int type) +{ +	(version == KEY_FORMAT_3_5) ? +	    (void)(key->u.k_offset_v1.k_uniqueness = +		   cpu_to_le32(type2uniqueness(type))) +	    : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type)); +} + +static inline void set_le_ih_k_type(struct item_head *ih, int type) +{ +	set_le_key_k_type(ih_version(ih), &(ih->ih_key), type); +} + +static inline int is_direntry_le_key(int version, struct reiserfs_key *key) +{ +	return le_key_k_type(version, key) == TYPE_DIRENTRY; +} + +static inline int is_direct_le_key(int version, struct reiserfs_key *key) +{ +	return le_key_k_type(version, key) == TYPE_DIRECT; +} + +static inline int is_indirect_le_key(int version, struct reiserfs_key *key) +{ +	return le_key_k_type(version, key) == TYPE_INDIRECT; +} + +static inline int is_statdata_le_key(int version, struct reiserfs_key *key) +{ +	return le_key_k_type(version, key) == TYPE_STAT_DATA; +} + +// +// item header has version. 
+// +static inline int is_direntry_le_ih(struct item_head *ih) +{ +	return is_direntry_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_direct_le_ih(struct item_head *ih) +{ +	return is_direct_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_indirect_le_ih(struct item_head *ih) +{ +	return is_indirect_le_key(ih_version(ih), &ih->ih_key); +} + +static inline int is_statdata_le_ih(struct item_head *ih) +{ +	return is_statdata_le_key(ih_version(ih), &ih->ih_key); +} + +// +// key is pointer to cpu key, result is cpu +// +static inline loff_t cpu_key_k_offset(const struct cpu_key *key) +{ +	return key->on_disk_key.k_offset; +} + +static inline loff_t cpu_key_k_type(const struct cpu_key *key) +{ +	return key->on_disk_key.k_type; +} + +static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset) +{ +	key->on_disk_key.k_offset = offset; +} + +static inline void set_cpu_key_k_type(struct cpu_key *key, int type) +{ +	key->on_disk_key.k_type = type; +} + +static inline void cpu_key_k_offset_dec(struct cpu_key *key) +{ +	key->on_disk_key.k_offset--; +} + +#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY) +#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT) +#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT) +#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA) + +/* are these used ? 
*/ +#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key))) +#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key))) +#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key))) +#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key))) + +#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \ +    (!COMP_SHORT_KEYS(ih, key) && \ +	  I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize)) + +/* maximal length of item */ +#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE) +#define MIN_ITEM_LEN 1 + +/* object identifier for root dir */ +#define REISERFS_ROOT_OBJECTID 2 +#define REISERFS_ROOT_PARENT_OBJECTID 1 + +extern struct reiserfs_key root_key; + +/*  + * Picture represents a leaf of the S+tree + *  ______________________________________________________ + * |      |  Array of     |                   |           | + * |Block |  Object-Item  |      F r e e      |  Objects- | + * | head |  Headers      |     S p a c e     |   Items   | + * |______|_______________|___________________|___________| + */ + +/* Header of a disk block.  More precisely, header of a formatted leaf +   or internal node, and not the header of an unformatted node. */ +struct block_head { +	__le16 blk_level;	/* Level of a block in the tree. */ +	__le16 blk_nr_item;	/* Number of keys/items in a block. */ +	__le16 blk_free_space;	/* Block free space in bytes. 
*/ +	__le16 blk_reserved; +	/* dump this in v4/planA */ +	struct reiserfs_key blk_right_delim_key;	/* kept only for compatibility */ +}; + +#define BLKH_SIZE                     (sizeof(struct block_head)) +#define blkh_level(p_blkh)            (le16_to_cpu((p_blkh)->blk_level)) +#define blkh_nr_item(p_blkh)          (le16_to_cpu((p_blkh)->blk_nr_item)) +#define blkh_free_space(p_blkh)       (le16_to_cpu((p_blkh)->blk_free_space)) +#define blkh_reserved(p_blkh)         (le16_to_cpu((p_blkh)->blk_reserved)) +#define set_blkh_level(p_blkh,val)    ((p_blkh)->blk_level = cpu_to_le16(val)) +#define set_blkh_nr_item(p_blkh,val)  ((p_blkh)->blk_nr_item = cpu_to_le16(val)) +#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val)) +#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val)) +#define blkh_right_delim_key(p_blkh)  ((p_blkh)->blk_right_delim_key) +#define set_blkh_right_delim_key(p_blkh,val)  ((p_blkh)->blk_right_delim_key = val) + +/* + * values for blk_level field of the struct block_head + */ + +#define FREE_LEVEL 0		/* when node gets removed from the tree its +				   blk_level is set to FREE_LEVEL. It is then +				   used to see whether the node is still in the +				   tree */ + +#define DISK_LEAF_NODE_LEVEL  1	/* Leaf node level. */ + +/* Given the buffer head of a formatted node, resolve to the block head of that node. */ +#define B_BLK_HEAD(bh)			((struct block_head *)((bh)->b_data)) +/* Number of items that are in buffer. */ +#define B_NR_ITEMS(bh)			(blkh_nr_item(B_BLK_HEAD(bh))) +#define B_LEVEL(bh)			(blkh_level(B_BLK_HEAD(bh))) +#define B_FREE_SPACE(bh)		(blkh_free_space(B_BLK_HEAD(bh))) + +#define PUT_B_NR_ITEMS(bh, val)		do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_LEVEL(bh, val)		do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0) +#define PUT_B_FREE_SPACE(bh, val)	do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0) + +/* Get right delimiting key. 
-- little endian */ +#define B_PRIGHT_DELIM_KEY(bh)		(&(blk_right_delim_key(B_BLK_HEAD(bh)))) + +/* Does the buffer contain a disk leaf. */ +#define B_IS_ITEMS_LEVEL(bh)		(B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL) + +/* Does the buffer contain a disk internal node */ +#define B_IS_KEYS_LEVEL(bh)      (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \ +					    && B_LEVEL(bh) <= MAX_HEIGHT) + +/***************************************************************************/ +/*                             STAT DATA                                   */ +/***************************************************************************/ + +// +// old stat data is 32 bytes long. We are going to distinguish new one by +// different size +// +struct stat_data_v1 { +	__le16 sd_mode;		/* file type, permissions */ +	__le16 sd_nlink;	/* number of hard links */ +	__le16 sd_uid;		/* owner */ +	__le16 sd_gid;		/* group */ +	__le32 sd_size;		/* file size */ +	__le32 sd_atime;	/* time of last access */ +	__le32 sd_mtime;	/* time file was last modified  */ +	__le32 sd_ctime;	/* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ +	union { +		__le32 sd_rdev; +		__le32 sd_blocks;	/* number of blocks file uses */ +	} __attribute__ ((__packed__)) u; +	__le32 sd_first_direct_byte;	/* first byte of file which is stored +					   in a direct item: except that if it +					   equals 1 it is a symlink and if it +					   equals ~(__u32)0 there is no +					   direct item.  The existence of this +					   field really grates on me. Let's +					   replace it with a macro based on +					   sd_size and our tail suppression +					   policy.  Someday.  
-Hans */ +} __attribute__ ((__packed__)); + +#define SD_V1_SIZE              (sizeof(struct stat_data_v1)) +#define stat_data_v1(ih)        (ih_version (ih) == KEY_FORMAT_3_5) +#define sd_v1_mode(sdp)         (le16_to_cpu((sdp)->sd_mode)) +#define set_sd_v1_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v)) +#define sd_v1_nlink(sdp)        (le16_to_cpu((sdp)->sd_nlink)) +#define set_sd_v1_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le16(v)) +#define sd_v1_uid(sdp)          (le16_to_cpu((sdp)->sd_uid)) +#define set_sd_v1_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le16(v)) +#define sd_v1_gid(sdp)          (le16_to_cpu((sdp)->sd_gid)) +#define set_sd_v1_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le16(v)) +#define sd_v1_size(sdp)         (le32_to_cpu((sdp)->sd_size)) +#define set_sd_v1_size(sdp,v)   ((sdp)->sd_size = cpu_to_le32(v)) +#define sd_v1_atime(sdp)        (le32_to_cpu((sdp)->sd_atime)) +#define set_sd_v1_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v)) +#define sd_v1_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime)) +#define set_sd_v1_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v)) +#define sd_v1_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime)) +#define set_sd_v1_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v)) +#define sd_v1_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev)) +#define set_sd_v1_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v)) +#define sd_v1_blocks(sdp)       (le32_to_cpu((sdp)->u.sd_blocks)) +#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v)) +#define sd_v1_first_direct_byte(sdp) \ +                                (le32_to_cpu((sdp)->sd_first_direct_byte)) +#define set_sd_v1_first_direct_byte(sdp,v) \ +                                ((sdp)->sd_first_direct_byte = cpu_to_le32(v)) + +/* inode flags stored in sd_attrs (nee sd_reserved) */ + +/* we want common flags to have the same values as in ext2, +   so chattr(1) will work without problems */ +#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL +#define REISERFS_APPEND_FL    FS_APPEND_FL 
+#define REISERFS_SYNC_FL      FS_SYNC_FL +#define REISERFS_NOATIME_FL   FS_NOATIME_FL +#define REISERFS_NODUMP_FL    FS_NODUMP_FL +#define REISERFS_SECRM_FL     FS_SECRM_FL +#define REISERFS_UNRM_FL      FS_UNRM_FL +#define REISERFS_COMPR_FL     FS_COMPR_FL +#define REISERFS_NOTAIL_FL    FS_NOTAIL_FL + +/* persistent flags that file inherits from the parent directory */ +#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL |	\ +				REISERFS_SYNC_FL |	\ +				REISERFS_NOATIME_FL |	\ +				REISERFS_NODUMP_FL |	\ +				REISERFS_SECRM_FL |	\ +				REISERFS_COMPR_FL |	\ +				REISERFS_NOTAIL_FL ) + +/* Stat Data on disk (reiserfs version of UFS disk inode minus the +   address blocks) */ +struct stat_data { +	__le16 sd_mode;		/* file type, permissions */ +	__le16 sd_attrs;	/* persistent inode flags */ +	__le32 sd_nlink;	/* number of hard links */ +	__le64 sd_size;		/* file size */ +	__le32 sd_uid;		/* owner */ +	__le32 sd_gid;		/* group */ +	__le32 sd_atime;	/* time of last access */ +	__le32 sd_mtime;	/* time file was last modified  */ +	__le32 sd_ctime;	/* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ +	__le32 sd_blocks; +	union { +		__le32 sd_rdev; +		__le32 sd_generation; +		//__le32 sd_first_direct_byte; +		/* first byte of file which is stored in a +		   direct item: except that if it equals 1 +		   it is a symlink and if it equals +		   ~(__u32)0 there is no direct item.  The +		   existence of this field really grates +		   on me. Let's replace it with a macro +		   based on sd_size and our tail +		   suppression policy? 
*/ +	} __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); +// +// this is 44 bytes long +// +#define SD_SIZE (sizeof(struct stat_data)) +#define SD_V2_SIZE              SD_SIZE +#define stat_data_v2(ih)        (ih_version (ih) == KEY_FORMAT_3_6) +#define sd_v2_mode(sdp)         (le16_to_cpu((sdp)->sd_mode)) +#define set_sd_v2_mode(sdp,v)   ((sdp)->sd_mode = cpu_to_le16(v)) +/* sd_reserved */ +/* set_sd_reserved */ +#define sd_v2_nlink(sdp)        (le32_to_cpu((sdp)->sd_nlink)) +#define set_sd_v2_nlink(sdp,v)  ((sdp)->sd_nlink = cpu_to_le32(v)) +#define sd_v2_size(sdp)         (le64_to_cpu((sdp)->sd_size)) +#define set_sd_v2_size(sdp,v)   ((sdp)->sd_size = cpu_to_le64(v)) +#define sd_v2_uid(sdp)          (le32_to_cpu((sdp)->sd_uid)) +#define set_sd_v2_uid(sdp,v)    ((sdp)->sd_uid = cpu_to_le32(v)) +#define sd_v2_gid(sdp)          (le32_to_cpu((sdp)->sd_gid)) +#define set_sd_v2_gid(sdp,v)    ((sdp)->sd_gid = cpu_to_le32(v)) +#define sd_v2_atime(sdp)        (le32_to_cpu((sdp)->sd_atime)) +#define set_sd_v2_atime(sdp,v)  ((sdp)->sd_atime = cpu_to_le32(v)) +#define sd_v2_mtime(sdp)        (le32_to_cpu((sdp)->sd_mtime)) +#define set_sd_v2_mtime(sdp,v)  ((sdp)->sd_mtime = cpu_to_le32(v)) +#define sd_v2_ctime(sdp)        (le32_to_cpu((sdp)->sd_ctime)) +#define set_sd_v2_ctime(sdp,v)  ((sdp)->sd_ctime = cpu_to_le32(v)) +#define sd_v2_blocks(sdp)       (le32_to_cpu((sdp)->sd_blocks)) +#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v)) +#define sd_v2_rdev(sdp)         (le32_to_cpu((sdp)->u.sd_rdev)) +#define set_sd_v2_rdev(sdp,v)   ((sdp)->u.sd_rdev = cpu_to_le32(v)) +#define sd_v2_generation(sdp)   (le32_to_cpu((sdp)->u.sd_generation)) +#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v)) +#define sd_v2_attrs(sdp)         (le16_to_cpu((sdp)->sd_attrs)) +#define set_sd_v2_attrs(sdp,v)   ((sdp)->sd_attrs = cpu_to_le16(v)) + +/***************************************************************************/ +/*                  
    DIRECTORY STRUCTURE                                */ +/***************************************************************************/ +/*  +   Picture represents the structure of directory items +   ________________________________________________ +   |  Array of     |   |     |        |       |   | +   | directory     |N-1| N-2 | ....   |   1st |0th| +   | entry headers |   |     |        |       |   | +   |_______________|___|_____|________|_______|___| +                    <----   directory entries         ------> + + First directory item has k_offset component 1. We store "." and ".." + in one item, always, we never split "." and ".." into differing + items.  This makes, among other things, the code for removing + directories simpler. */ +#define SD_OFFSET  0 +#define SD_UNIQUENESS 0 +#define DOT_OFFSET 1 +#define DOT_DOT_OFFSET 2 +#define DIRENTRY_UNIQUENESS 500 + +/* */ +#define FIRST_ITEM_OFFSET 1 + +/* +   Q: How to get key of object pointed to by entry from entry?   + +   A: Each directory entry has its header. 
This header has deh_dir_id and deh_objectid fields, those are key +      of object, entry points to */ + +/* NOT IMPLEMENTED:    +   Directory will someday contain stat data of object */ + +struct reiserfs_de_head { +	__le32 deh_offset;	/* third component of the directory entry key */ +	__le32 deh_dir_id;	/* objectid of the parent directory of the object, that is referenced +				   by directory entry */ +	__le32 deh_objectid;	/* objectid of the object, that is referenced by directory entry */ +	__le16 deh_location;	/* offset of name in the whole item */ +	__le16 deh_state;	/* whether 1) entry contains stat data (for future), and 2) whether +				   entry is hidden (unlinked) */ +} __attribute__ ((__packed__)); +#define DEH_SIZE                  sizeof(struct reiserfs_de_head) +#define deh_offset(p_deh)         (le32_to_cpu((p_deh)->deh_offset)) +#define deh_dir_id(p_deh)         (le32_to_cpu((p_deh)->deh_dir_id)) +#define deh_objectid(p_deh)       (le32_to_cpu((p_deh)->deh_objectid)) +#define deh_location(p_deh)       (le16_to_cpu((p_deh)->deh_location)) +#define deh_state(p_deh)          (le16_to_cpu((p_deh)->deh_state)) + +#define put_deh_offset(p_deh,v)   ((p_deh)->deh_offset = cpu_to_le32((v))) +#define put_deh_dir_id(p_deh,v)   ((p_deh)->deh_dir_id = cpu_to_le32((v))) +#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v))) +#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v))) +#define put_deh_state(p_deh,v)    ((p_deh)->deh_state = cpu_to_le16((v))) + +/* empty directory contains two entries "." and ".." 
and their headers */ +#define EMPTY_DIR_SIZE \ +(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen (".."))) + +/* old format directories have this size when empty */ +#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3) + +#define DEH_Statdata 0		/* not used now */ +#define DEH_Visible 2 + +/* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */ +#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__) +#   define ADDR_UNALIGNED_BITS  (3) +#endif + +/* These are only used to manipulate deh_state. + * Because of this, we'll use the ext2_ bit routines, + * since they are little endian */ +#ifdef ADDR_UNALIGNED_BITS + +#   define aligned_address(addr)           ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1))) +#   define unaligned_offset(addr)          (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3) + +#   define set_bit_unaligned(nr, addr)	\ +	__test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) +#   define clear_bit_unaligned(nr, addr)	\ +	__test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) +#   define test_bit_unaligned(nr, addr)	\ +	test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr)) + +#else + +#   define set_bit_unaligned(nr, addr)	__test_and_set_bit_le(nr, addr) +#   define clear_bit_unaligned(nr, addr)	__test_and_clear_bit_le(nr, addr) +#   define test_bit_unaligned(nr, addr)	test_bit_le(nr, addr) + +#endif + +#define mark_de_with_sd(deh)        set_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_without_sd(deh)     clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_visible(deh)	    set_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define mark_de_hidden(deh)	    clear_bit_unaligned (DEH_Visible, &((deh)->deh_state)) + +#define de_with_sd(deh)		    test_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define de_visible(deh)	    	    test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define 
de_hidden(deh)	    	    !test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) + +extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, +				   __le32 par_dirid, __le32 par_objid); +extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, +				__le32 par_dirid, __le32 par_objid); + +/* array of the entry headers */ + /* get item body */ +#define B_I_PITEM(bh,ih) ( (bh)->b_data + ih_location(ih) ) +#define B_I_DEH(bh,ih) ((struct reiserfs_de_head *)(B_I_PITEM(bh,ih))) + +/* length of the directory entry in directory item. This define +   calculates length of i-th directory entry using directory entry +   locations from dir entry head. When it calculates length of 0-th +   directory entry, it uses length of whole item in place of entry +   location of the non-existent following entry in the calculation. +   See picture above.*/ +/* +#define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \ +((i) ? (deh_location((deh)-1) - deh_location((deh))) : (ih_item_len((ih)) - deh_location((deh)))) +*/ +static inline int entry_length(const struct buffer_head *bh, +			       const struct item_head *ih, int pos_in_item) +{ +	struct reiserfs_de_head *deh; + +	deh = B_I_DEH(bh, ih) + pos_in_item; +	if (pos_in_item) +		return deh_location(deh - 1) - deh_location(deh); + +	return ih_item_len(ih) - deh_location(deh); +} + +/* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */ +#define I_ENTRY_COUNT(ih) (ih_entry_count((ih))) + +/* name by bh, ih and entry_num */ +#define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih_location(ih) + deh_location(B_I_DEH(bh,ih)+(entry_num)))) + +// two entries per block (at least) +#define REISERFS_MAX_NAME(block_size) 255 + +/* this structure is used for operations on directory entries. It is +   not a disk structure. 
*/ +/* When reiserfs_find_entry or search_by_entry_key find directory +   entry, they return filled reiserfs_dir_entry structure */ +struct reiserfs_dir_entry { +	struct buffer_head *de_bh; +	int de_item_num; +	struct item_head *de_ih; +	int de_entry_num; +	struct reiserfs_de_head *de_deh; +	int de_entrylen; +	int de_namelen; +	char *de_name; +	unsigned long *de_gen_number_bit_string; + +	__u32 de_dir_id; +	__u32 de_objectid; + +	struct cpu_key de_entry_key; +}; + +/* these defines are useful when a particular member of a reiserfs_dir_entry is needed */ + +/* pointer to file name, stored in entry */ +#define B_I_DEH_ENTRY_FILE_NAME(bh,ih,deh) (B_I_PITEM (bh, ih) + deh_location(deh)) + +/* length of name */ +#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \ +(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0)) + +/* hash value occupies bits from 7 up to 30 */ +#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL) +/* generation number occupies 7 bits starting from 0 up to 6 */ +#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL) +#define MAX_GENERATION_NUMBER  127 + +#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number)) + +/* + * Picture represents an internal node of the reiserfs tree + *  ______________________________________________________ + * |      |  Array of     |  Array of         |  Free     | + * |block |    keys       |  pointers         | space     | + * | head |      N        |      N+1          |           | + * |______|_______________|___________________|___________| + */ + +/***************************************************************************/ +/*                      DISK CHILD                                         */ +/***************************************************************************/ +/* Disk child pointer: The pointer from an internal node of the tree +   to a node that is on disk. 
*/ +struct disk_child { +	__le32 dc_block_number;	/* Disk child's block number. */ +	__le16 dc_size;		/* Disk child's used space.   */ +	__le16 dc_reserved; +}; + +#define DC_SIZE (sizeof(struct disk_child)) +#define dc_block_number(dc_p)	(le32_to_cpu((dc_p)->dc_block_number)) +#define dc_size(dc_p)		(le16_to_cpu((dc_p)->dc_size)) +#define put_dc_block_number(dc_p, val)   do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0) +#define put_dc_size(dc_p, val)   do { (dc_p)->dc_size = cpu_to_le16(val); } while(0) + +/* Get disk child by buffer header and position in the tree node. */ +#define B_N_CHILD(bh, n_pos)  ((struct disk_child *)\ +((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos))) + +/* Get disk child number by buffer header and position in the tree node. */ +#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos))) +#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \ +				(put_dc_block_number(B_N_CHILD(bh, n_pos), val)) + + /* maximal value of field child_size in structure disk_child */ + /* child size is the combined size of all items and their headers */ +#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE )) + +/* amount of used space in buffer (not including block head) */ +#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur))) + +/* max and min number of keys in internal node */ +#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) ) +#define MIN_NR_KEY(bh)    (MAX_NR_KEY(bh)/2) + +/***************************************************************************/ +/*                      PATH STRUCTURES AND DEFINES                        */ +/***************************************************************************/ + +/* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the +   key.  It uses reiserfs_bread to try to find buffers in the cache given their block number.  If it +   does not find them in the cache it reads them from disk.  
For each node search_by_key finds using +   reiserfs_bread it then uses bin_search to look through that node.  bin_search will find the +   position of the block_number of the next node if it is looking through an internal node.  If it +   is looking through a leaf node bin_search will find the position of the item which has key either +   equal to given key, or which is the maximal key less than the given key. */ + +struct path_element { +	struct buffer_head *pe_buffer;	/* Pointer to the buffer at the path in the tree. */ +	int pe_position;	/* Position in the tree node which is placed in the */ +	/* buffer above.                                  */ +}; + +#define MAX_HEIGHT 5		/* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */ +#define EXTENDED_MAX_HEIGHT         7	/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ +#define FIRST_PATH_ELEMENT_OFFSET   2	/* Must be equal to at least 2. */ + +#define ILLEGAL_PATH_ELEMENT_OFFSET 1	/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ +#define MAX_FEB_SIZE 6		/* this MUST be MAX_HEIGHT + 1. See about FEB below */ + +/* We need to keep track of who the ancestors of nodes are.  When we +   perform a search we record which nodes were visited while +   descending the tree looking for the node we searched for. This list +   of nodes is called the path.  This information is used while +   performing balancing.  Note that this path information may become +   invalid, and this means we must check it when using it to see if it +   is still valid. You'll need to read search_by_key and the comments +   in it, especially about decrement_counters_in_path(), to understand +   this structure.   + +Paths make the code so much harder to work with and debug.... An +enormous number of bugs are due to them, and trying to write or modify +code that uses them just makes my head hurt.  
They are based on an +excessive effort to avoid disturbing the precious VFS code.:-( The +gods only know how we are going to SMP the code that uses them. +znodes are the way! */ + +#define PATH_READA	0x1	/* do read ahead */ +#define PATH_READA_BACK 0x2	/* read backwards */ + +struct treepath { +	int path_length;	/* Length of the array above.   */ +	int reada; +	struct path_element path_elements[EXTENDED_MAX_HEIGHT];	/* Array of the path elements.  */ +	int pos_in_item; +}; + +#define pos_in_item(path) ((path)->pos_in_item) + +#define INITIALIZE_PATH(var) \ +struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} + +/* Get path element by path and path position. */ +#define PATH_OFFSET_PELEMENT(path, n_offset)  ((path)->path_elements + (n_offset)) + +/* Get buffer header at the path by path and path position. */ +#define PATH_OFFSET_PBUFFER(path, n_offset)   (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer) + +/* Get position in the element at the path by path and path position. */ +#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position) + +#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length)) +				/* you know, to the person who didn't +				   write this the macro name does not +				   at first suggest what it does. +				   Maybe POSITION_FROM_PATH_END? Or +				   maybe we should just focus on +				   dumping paths... -Hans */ +#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length)) + +#define PATH_PITEM_HEAD(path)    B_N_PITEM_HEAD(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)) + +/* in do_balance leaf has h == 0 in contrast with path structure, +   where root has level == 0. 
That is why we need these defines */ +#define PATH_H_PBUFFER(path, h) PATH_OFFSET_PBUFFER (path, path->path_length - (h))	/* tb->S[h] */ +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1)	/* tb->F[h] or tb->S[0]->b_parent */ +#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h)) +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)	/* tb->S[h]->b_item_order */ + +#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h)) + +#define get_last_bh(path) PATH_PLAST_BUFFER(path) +#define get_ih(path) PATH_PITEM_HEAD(path) +#define get_item_pos(path) PATH_LAST_POSITION(path) +#define get_item(path) ((void *)B_N_PITEM(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION (path))) +#define item_moved(ih,path) comp_items(ih, path) +#define path_changed(ih,path) comp_items (ih, path) + +/***************************************************************************/ +/*                       MISC                                              */ +/***************************************************************************/ + +/* Size of pointer to the unformatted node. */ +#define UNFM_P_SIZE (sizeof(unp_t)) +#define UNFM_P_SHIFT 2 + +// in in-core inode key is stored on le form +#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key)) + +#define MAX_UL_INT 0xffffffff +#define MAX_INT    0x7ffffff +#define MAX_US_INT 0xffff + +// reiserfs version 2 has max offset 60 bits. 
Version 1 - 32 bit offset +#define U32_MAX (~(__u32)0) + +static inline loff_t max_reiserfs_offset(struct inode *inode) +{ +	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) +		return (loff_t) U32_MAX; + +	return (loff_t) ((~(__u64) 0) >> 4); +} + +/*#define MAX_KEY_UNIQUENESS	MAX_UL_INT*/ +#define MAX_KEY_OBJECTID	MAX_UL_INT + +#define MAX_B_NUM  MAX_UL_INT +#define MAX_FC_NUM MAX_US_INT + +/* the purpose is to detect overflow of an unsigned short */ +#define REISERFS_LINK_MAX (MAX_US_INT - 1000) + +/* The following defines are used in reiserfs_insert_item and reiserfs_append_item  */ +#define REISERFS_KERNEL_MEM		0	/* reiserfs kernel memory mode  */ +#define REISERFS_USER_MEM		1	/* reiserfs user memory mode            */ + +#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) +#define get_generation(s) atomic_read (&fs_generation(s)) +#define FILESYSTEM_CHANGED_TB(tb)  (get_generation((tb)->tb_sb) != (tb)->fs_gen) +#define __fs_changed(gen,s) (gen != get_generation (s)) +#define fs_changed(gen,s)		\ +({					\ +	reiserfs_cond_resched(s);	\ +	__fs_changed(gen, s);		\ +}) + +/***************************************************************************/ +/*                  FIXATE NODES                                           */ +/***************************************************************************/ + +#define VI_TYPE_LEFT_MERGEABLE 1 +#define VI_TYPE_RIGHT_MERGEABLE 2 + +/* To make any changes in the tree we always first find node, that +   contains item to be changed/deleted or place to insert a new +   item. We call this node S. To do balancing we need to decide what +   we will shift to left/right neighbor, or to a new node, where new +   item will be etc. To make this analysis simpler we build virtual +   node. Virtual node is an array of items, that will replace items of +   node S. (For instance if we are going to delete an item, virtual +   node does not contain it). 
Virtual node keeps information about +   item sizes and types, mergeability of first and last items, sizes +   of all entries in directory item. We use this array of items when +   calculating what we can shift to neighbors and how many nodes we +   have to have if we do not any shiftings, if we shift to left/right +   neighbor or to both. */ +struct virtual_item { +	int vi_index;		// index in the array of item operations +	unsigned short vi_type;	// left/right mergeability +	unsigned short vi_item_len;	/* length of item that it will have after balancing */ +	struct item_head *vi_ih; +	const char *vi_item;	// body of item (old or new) +	const void *vi_new_data;	// 0 always but paste mode +	void *vi_uarea;		// item specific area +}; + +struct virtual_node { +	char *vn_free_ptr;	/* this is a pointer to the free space in the buffer */ +	unsigned short vn_nr_item;	/* number of items in virtual node */ +	short vn_size;		/* size of node , that node would have if it has unlimited size and no balancing is performed */ +	short vn_mode;		/* mode of balancing (paste, insert, delete, cut) */ +	short vn_affected_item_num; +	short vn_pos_in_item; +	struct item_head *vn_ins_ih;	/* item header of inserted item, 0 for other modes */ +	const void *vn_data; +	struct virtual_item *vn_vi;	/* array of items (including a new one, excluding item to be deleted) */ +}; + +/* used by directory items when creating virtual nodes */ +struct direntry_uarea { +	int flags; +	__u16 entry_count; +	__u16 entry_sizes[1]; +} __attribute__ ((__packed__)); + +/***************************************************************************/ +/*                  TREE BALANCE                                           */ +/***************************************************************************/ + +/* This temporary structure is used in tree balance algorithms, and +   constructed as we go to the extent that its various parts are +   needed.  
It contains arrays of nodes that can potentially be +   involved in the balancing of node S, and parameters that define how +   each of the nodes must be balanced.  Note that in these algorithms +   for balancing the worst case is to need to balance the current node +   S and the left and right neighbors and all of their parents plus +   create a new node.  We implement S1 balancing for the leaf nodes +   and S0 balancing for the internal nodes (S1 and S0 are defined in +   our papers.)*/ + +#define MAX_FREE_BLOCK 7	/* size of the array of buffers to free at end of do_balance */ + +/* maximum number of FEB blocknrs on a single level */ +#define MAX_AMOUNT_NEEDED 2 + +/* someday somebody will prefix every field in this struct with tb_ */ +struct tree_balance { +	int tb_mode; +	int need_balance_dirty; +	struct super_block *tb_sb; +	struct reiserfs_transaction_handle *transaction_handle; +	struct treepath *tb_path; +	struct buffer_head *L[MAX_HEIGHT];	/* array of left neighbors of nodes in the path */ +	struct buffer_head *R[MAX_HEIGHT];	/* array of right neighbors of nodes in the path */ +	struct buffer_head *FL[MAX_HEIGHT];	/* array of fathers of the left  neighbors      */ +	struct buffer_head *FR[MAX_HEIGHT];	/* array of fathers of the right neighbors      */ +	struct buffer_head *CFL[MAX_HEIGHT];	/* array of common parents of center node and its left neighbor  */ +	struct buffer_head *CFR[MAX_HEIGHT];	/* array of common parents of center node and its right neighbor */ + +	struct buffer_head *FEB[MAX_FEB_SIZE];	/* array of empty buffers. Number of buffers in array equals +						   cur_blknum. 
*/ +	struct buffer_head *used[MAX_FEB_SIZE]; +	struct buffer_head *thrown[MAX_FEB_SIZE]; +	int lnum[MAX_HEIGHT];	/* array of number of items which must be +				   shifted to the left in order to balance the +				   current node; for leaves includes item that +				   will be partially shifted; for internal +				   nodes, it is the number of child pointers +				   rather than items. It includes the new item +				   being created. The code sometimes subtracts +				   one to get the number of wholly shifted +				   items for other purposes. */ +	int rnum[MAX_HEIGHT];	/* substitute right for left in comment above */ +	int lkey[MAX_HEIGHT];	/* array indexed by height h mapping the key delimiting L[h] and +				   S[h] to its item number within the node CFL[h] */ +	int rkey[MAX_HEIGHT];	/* substitute r for l in comment above */ +	int insert_size[MAX_HEIGHT];	/* the number of bytes by we are trying to add or remove from +					   S[h]. A negative value means removing.  */ +	int blknum[MAX_HEIGHT];	/* number of nodes that will replace node S[h] after +				   balancing on the level h of the tree.  
If 0 then S is +				   being deleted, if 1 then S is remaining and no new nodes +				   are being created, if 2 or 3 then 1 or 2 new nodes is +				   being created */ + +	/* fields that are used only for balancing leaves of the tree */ +	int cur_blknum;		/* number of empty blocks having been already allocated                 */ +	int s0num;		/* number of items that fall into left most  node when S[0] splits     */ +	int s1num;		/* number of items that fall into first  new node when S[0] splits     */ +	int s2num;		/* number of items that fall into second new node when S[0] splits     */ +	int lbytes;		/* number of bytes which can flow to the left neighbor from the        left    */ +	/* most liquid item that cannot be shifted from S[0] entirely         */ +	/* if -1 then nothing will be partially shifted */ +	int rbytes;		/* number of bytes which will flow to the right neighbor from the right        */ +	/* most liquid item that cannot be shifted from S[0] entirely         */ +	/* if -1 then nothing will be partially shifted                           */ +	int s1bytes;		/* number of bytes which flow to the first  new node when S[0] splits   */ +	/* note: if S[0] splits into 3 nodes, then items do not need to be cut  */ +	int s2bytes; +	struct buffer_head *buf_to_free[MAX_FREE_BLOCK];	/* buffers which are to be freed after do_balance finishes by unfix_nodes */ +	char *vn_buf;		/* kmalloced memory. Used to create +				   virtual node and keep map of +				   dirtied bitmap blocks */ +	int vn_buf_size;	/* size of the vn_buf */ +	struct virtual_node *tb_vn;	/* VN starts after bitmap of bitmap blocks */ + +	int fs_gen;		/* saved value of `reiserfs_generation' counter +				   see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */ +#ifdef DISPLACE_NEW_PACKING_LOCALITIES +	struct in_core_key key;	/* key pointer, to pass to block allocator or +				   another low-level subsystem */ +#endif +}; + +/* These are modes of balancing */ + +/* When inserting an item. 
*/ +#define M_INSERT	'i' +/* When inserting into (directories only) or appending onto an already +   existent item. */ +#define M_PASTE		'p' +/* When deleting an item. */ +#define M_DELETE	'd' +/* When truncating an item or removing an entry from a (directory) item. */ +#define M_CUT 		'c' + +/* used when balancing on leaf level skipped (in reiserfsck) */ +#define M_INTERNAL	'n' + +/* When further balancing is not needed, then do_balance does not need +   to be called. */ +#define M_SKIP_BALANCING 		's' +#define M_CONVERT	'v' + +/* modes of leaf_move_items */ +#define LEAF_FROM_S_TO_L 0 +#define LEAF_FROM_S_TO_R 1 +#define LEAF_FROM_R_TO_L 2 +#define LEAF_FROM_L_TO_R 3 +#define LEAF_FROM_S_TO_SNEW 4 + +#define FIRST_TO_LAST 0 +#define LAST_TO_FIRST 1 + +/* used in do_balance for passing parent of node information that has +   been gotten from tb struct */ +struct buffer_info { +	struct tree_balance *tb; +	struct buffer_head *bi_bh; +	struct buffer_head *bi_parent; +	int bi_position; +}; + +static inline struct super_block *sb_from_tb(struct tree_balance *tb) +{ +	return tb ? tb->tb_sb : NULL; +} + +static inline struct super_block *sb_from_bi(struct buffer_info *bi) +{ +	return bi ? sb_from_tb(bi->tb) : NULL; +} + +/* there are 4 types of items: stat data, directory item, indirect, direct. ++-------------------+------------+--------------+------------+ +|	            |  k_offset  | k_uniqueness | mergeable? 
| ++-------------------+------------+--------------+------------+ +|     stat data     |	0        |      0       |   no       | ++-------------------+------------+--------------+------------+ +| 1st directory item| DOT_OFFSET |DIRENTRY_UNIQUENESS|   no       |  +| non 1st directory | hash value |              |   yes      | +|     item          |            |              |            | ++-------------------+------------+--------------+------------+ +| indirect item     | offset + 1 |TYPE_INDIRECT |   if this is not the first indirect item of the object ++-------------------+------------+--------------+------------+ +| direct item       | offset + 1 |TYPE_DIRECT   | if not this is not the first direct item of the object ++-------------------+------------+--------------+------------+ +*/ + +struct item_operations { +	int (*bytes_number) (struct item_head * ih, int block_size); +	void (*decrement_key) (struct cpu_key *); +	int (*is_left_mergeable) (struct reiserfs_key * ih, +				  unsigned long bsize); +	void (*print_item) (struct item_head *, char *item); +	void (*check_item) (struct item_head *, char *item); + +	int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi, +			  int is_affected, int insert_size); +	int (*check_left) (struct virtual_item * vi, int free, +			   int start_skip, int end_skip); +	int (*check_right) (struct virtual_item * vi, int free); +	int (*part_size) (struct virtual_item * vi, int from, int to); +	int (*unit_num) (struct virtual_item * vi); +	void (*print_vi) (struct virtual_item * vi); +}; + +extern struct item_operations *item_ops[TYPE_ANY + 1]; + +#define op_bytes_number(ih,bsize)                    item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize) +#define op_is_left_mergeable(key,bsize)              item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize) +#define op_print_item(ih,item)                       item_ops[le_ih_k_type (ih)]->print_item (ih, item) +#define op_check_item(ih,item) 
                      item_ops[le_ih_k_type (ih)]->check_item (ih, item) +#define op_create_vi(vn,vi,is_affected,insert_size)  item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size) +#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip) +#define op_check_right(vi,free)                      item_ops[(vi)->vi_index]->check_right (vi, free) +#define op_part_size(vi,from,to)                     item_ops[(vi)->vi_index]->part_size (vi, from, to) +#define op_unit_num(vi)				     item_ops[(vi)->vi_index]->unit_num (vi) +#define op_print_vi(vi)                              item_ops[(vi)->vi_index]->print_vi (vi) + +#define COMP_SHORT_KEYS comp_short_keys + +/* number of blocks pointed to by the indirect item */ +#define I_UNFM_NUM(ih)	(ih_item_len(ih) / UNFM_P_SIZE) + +/* the used space within the unformatted node corresponding to pos within the item pointed to by ih */ +#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? 
(size) - ih_free_space(ih) : (size)) + +/* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */ + +/* get the item header */ +#define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) ) + +/* get key */ +#define B_N_PDELIM_KEY(bh,item_num) ( (struct reiserfs_key * )((bh)->b_data + BLKH_SIZE) + (item_num) ) + +/* get the key */ +#define B_N_PKEY(bh,item_num) ( &(B_N_PITEM_HEAD(bh,item_num)->ih_key) ) + +/* get item body */ +#define B_N_PITEM(bh,item_num) ( (bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(item_num)))) + +/* get the stat data by the buffer header and the item order */ +#define B_N_STAT_DATA(bh,nr) \ +( (struct stat_data *)((bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(nr))) ) ) + +    /* following defines use reiserfs buffer header and item header */ + +/* get stat-data */ +#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) ) + +// this is 3976 for size==4096 +#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE) + +/* indirect items consist of entries which contain blocknrs, pos +   indicates which entry, and B_I_POS_UNFM_POINTER resolves to the +   blocknr contained by the entry pos points to */ +#define B_I_POS_UNFM_POINTER(bh,ih,pos) le32_to_cpu(*(((unp_t *)B_I_PITEM(bh,ih)) + (pos))) +#define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0) + +struct reiserfs_iget_args { +	__u32 objectid; +	__u32 dirid; +}; + +/***************************************************************************/ +/*                    FUNCTION DECLARATIONS                                */ +/***************************************************************************/ + +#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) + +#define journal_trans_half(blocksize) \ +	((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) 
- 12) / sizeof (__u32)) + +/* journal.c see journal.c for all the comments here */ + +/* first block written in a commit.  */ +struct reiserfs_journal_desc { +	__le32 j_trans_id;	/* id of commit */ +	__le32 j_len;		/* length of commit. len +1 is the commit block */ +	__le32 j_mount_id;	/* mount id of this trans */ +	__le32 j_realblock[1];	/* real locations for each block */ +}; + +#define get_desc_trans_id(d)   le32_to_cpu((d)->j_trans_id) +#define get_desc_trans_len(d)  le32_to_cpu((d)->j_len) +#define get_desc_mount_id(d)   le32_to_cpu((d)->j_mount_id) + +#define set_desc_trans_id(d,val)       do { (d)->j_trans_id = cpu_to_le32 (val); } while (0) +#define set_desc_trans_len(d,val)      do { (d)->j_len = cpu_to_le32 (val); } while (0) +#define set_desc_mount_id(d,val)       do { (d)->j_mount_id = cpu_to_le32 (val); } while (0) + +/* last block written in a commit */ +struct reiserfs_journal_commit { +	__le32 j_trans_id;	/* must match j_trans_id from the desc block */ +	__le32 j_len;		/* ditto */ +	__le32 j_realblock[1];	/* real locations for each block */ +}; + +#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) +#define get_commit_trans_len(c)        le32_to_cpu((c)->j_len) +#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id) + +#define set_commit_trans_id(c,val)     do { (c)->j_trans_id = cpu_to_le32 (val); } while (0) +#define set_commit_trans_len(c,val)    do { (c)->j_len = cpu_to_le32 (val); } while (0) + +/* this header block gets written whenever a transaction is considered fully flushed, and is more recent than the +** last fully flushed transaction.  fully flushed means all the log blocks and all the real blocks are on disk, +** and this transaction does not need to be replayed. 
+*/ +struct reiserfs_journal_header { +	__le32 j_last_flush_trans_id;	/* id of last fully flushed transaction */ +	__le32 j_first_unflushed_offset;	/* offset in the log of where to start replay after a crash */ +	__le32 j_mount_id; +	/* 12 */ struct journal_params jh_journal; +}; + +/* biggest tunable defines are right here */ +#define JOURNAL_BLOCK_COUNT 8192	/* number of blocks in the journal */ +#define JOURNAL_TRANS_MAX_DEFAULT 1024	/* biggest possible single transaction, don't change for now (8/3/99) */ +#define JOURNAL_TRANS_MIN_DEFAULT 256 +#define JOURNAL_MAX_BATCH_DEFAULT   900	/* max blocks to batch into one transaction, don't make this any bigger than 900 */ +#define JOURNAL_MIN_RATIO 2 +#define JOURNAL_MAX_COMMIT_AGE 30 +#define JOURNAL_MAX_TRANS_AGE 30 +#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9) +#define JOURNAL_BLOCKS_PER_OBJECT(sb)  (JOURNAL_PER_BALANCE_CNT * 3 + \ +					 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \ +					      REISERFS_QUOTA_TRANS_BLOCKS(sb))) + +#ifdef CONFIG_QUOTA +#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA)) +/* We need to update data and inode (atime) */ +#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0) +/* 1 balancing, 1 bitmap, 1 data per write + stat data update */ +#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ +(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0) +/* same as with INIT */ +#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \ +(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0) +#else +#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0 +#define REISERFS_QUOTA_INIT_BLOCKS(s) 0 +#define REISERFS_QUOTA_DEL_BLOCKS(s) 0 +#endif + +/* both of these can be as low as 1, or as high as you want.  The min is the +** number of 4k bitmap nodes preallocated on mount. 
New nodes are allocated +** as needed, and released when transactions are committed.  On release, if  +** the current number of nodes is > max, the node is freed, otherwise,  +** it is put on a free list for faster use later. +*/ +#define REISERFS_MIN_BITMAP_NODES 10 +#define REISERFS_MAX_BITMAP_NODES 100 + +#define JBH_HASH_SHIFT 13	/* these are based on journal hash size of 8192 */ +#define JBH_HASH_MASK 8191 + +#define _jhashfn(sb,block)	\ +	(((unsigned long)sb>>L1_CACHE_SHIFT) ^ \ +	 (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) +#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) + +// We need these to make journal.c code more readable +#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) +#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) +#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) + +enum reiserfs_bh_state_bits { +	BH_JDirty = BH_PrivateStart,	/* buffer is in current transaction */ +	BH_JDirty_wait, +	BH_JNew,		/* disk block was taken off free list before +				 * being in a finished transaction, or +				 * written to disk. Can be reused immed. 
*/ +	BH_JPrepared, +	BH_JRestore_dirty, +	BH_JTest,		// debugging only will go away +}; + +BUFFER_FNS(JDirty, journaled); +TAS_BUFFER_FNS(JDirty, journaled); +BUFFER_FNS(JDirty_wait, journal_dirty); +TAS_BUFFER_FNS(JDirty_wait, journal_dirty); +BUFFER_FNS(JNew, journal_new); +TAS_BUFFER_FNS(JNew, journal_new); +BUFFER_FNS(JPrepared, journal_prepared); +TAS_BUFFER_FNS(JPrepared, journal_prepared); +BUFFER_FNS(JRestore_dirty, journal_restore_dirty); +TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty); +BUFFER_FNS(JTest, journal_test); +TAS_BUFFER_FNS(JTest, journal_test); + +/* +** transaction handle which is passed around for all journal calls +*/ +struct reiserfs_transaction_handle { +	struct super_block *t_super;	/* super for this FS when journal_begin was +					   called. saves calls to reiserfs_get_super +					   also used by nested transactions to make +					   sure they are nesting on the right FS +					   _must_ be first in the handle +					 */ +	int t_refcount; +	int t_blocks_logged;	/* number of blocks this writer has logged */ +	int t_blocks_allocated;	/* number of blocks this writer allocated */ +	unsigned int t_trans_id;	/* sanity check, equals the current trans id */ +	void *t_handle_save;	/* save existing current->journal_info */ +	unsigned displace_new_blocks:1;	/* if new block allocation occurres, that block +					   should be displaced from others */ +	struct list_head t_list; +}; + +/* used to keep track of ordered and tail writes, attached to the buffer + * head through b_journal_head. 
+ */ +struct reiserfs_jh { +	struct reiserfs_journal_list *jl; +	struct buffer_head *bh; +	struct list_head list; +}; + +void reiserfs_free_jh(struct buffer_head *bh); +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh); +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh); +int journal_mark_dirty(struct reiserfs_transaction_handle *, +		       struct super_block *, struct buffer_head *bh); + +static inline int reiserfs_file_data_log(struct inode *inode) +{ +	if (reiserfs_data_log(inode->i_sb) || +	    (REISERFS_I(inode)->i_flags & i_data_log)) +		return 1; +	return 0; +} + +static inline int reiserfs_transaction_running(struct super_block *s) +{ +	struct reiserfs_transaction_handle *th = current->journal_info; +	if (th && th->t_super == s) +		return 1; +	if (th && th->t_super == NULL) +		BUG(); +	return 0; +} + +static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th) +{ +	return th->t_blocks_allocated - th->t_blocks_logged; +} + +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct +								    super_block +								    *, +								    int count); +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); +int reiserfs_commit_page(struct inode *inode, struct page *page, +			 unsigned from, unsigned to); +int reiserfs_flush_old_commits(struct super_block *); +int reiserfs_commit_for_inode(struct inode *); +int reiserfs_inode_needs_commit(struct inode *); +void reiserfs_update_inode_transaction(struct inode *); +void reiserfs_wait_on_write_block(struct super_block *s); +void reiserfs_block_writes(struct reiserfs_transaction_handle *th); +void reiserfs_allow_writes(struct super_block *s); +void reiserfs_check_lock_depth(struct super_block *s, char *caller); +int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, +				 int wait); +void reiserfs_restore_prepared_buffer(struct super_block *, +				      struct buffer_head *bh); 
+int journal_init(struct super_block *, const char *j_dev_name, int old_format, +		 unsigned int); +int journal_release(struct reiserfs_transaction_handle *, struct super_block *); +int journal_release_error(struct reiserfs_transaction_handle *, +			  struct super_block *); +int journal_end(struct reiserfs_transaction_handle *, struct super_block *, +		unsigned long); +int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, +		     unsigned long); +int journal_mark_freed(struct reiserfs_transaction_handle *, +		       struct super_block *, b_blocknr_t blocknr); +int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); +int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, +			 int bit_nr, int searchall, b_blocknr_t *next); +int journal_begin(struct reiserfs_transaction_handle *, +		  struct super_block *sb, unsigned long); +int journal_join_abort(struct reiserfs_transaction_handle *, +		       struct super_block *sb, unsigned long); +void reiserfs_abort_journal(struct super_block *sb, int errno); +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); +int reiserfs_allocate_list_bitmaps(struct super_block *s, +				   struct reiserfs_list_bitmap *, unsigned int); + +void add_save_link(struct reiserfs_transaction_handle *th, +		   struct inode *inode, int truncate); +int remove_save_link(struct inode *inode, int truncate); + +/* objectid.c */ +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th); +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, +			       __u32 objectid_to_release); +int reiserfs_convert_objectid_map_v1(struct super_block *); + +/* stree.c */ +int B_IS_IN_TREE(const struct buffer_head *); +extern void copy_item_head(struct item_head *to, +			   const struct item_head *from); + +// first key is in cpu form, second - le +extern int comp_short_keys(const struct reiserfs_key *le_key, +			   const struct cpu_key *cpu_key); 
+extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from); + +// both are in le form +extern int comp_le_keys(const struct reiserfs_key *, +			const struct reiserfs_key *); +extern int comp_short_le_keys(const struct reiserfs_key *, +			      const struct reiserfs_key *); + +// +// get key version from on disk key - kludge +// +static inline int le_key_version(const struct reiserfs_key *key) +{ +	int type; + +	type = offset_v2_k_type(&(key->u.k_offset_v2)); +	if (type != TYPE_DIRECT && type != TYPE_INDIRECT +	    && type != TYPE_DIRENTRY) +		return KEY_FORMAT_3_5; + +	return KEY_FORMAT_3_6; + +} + +static inline void copy_key(struct reiserfs_key *to, +			    const struct reiserfs_key *from) +{ +	memcpy(to, from, KEY_SIZE); +} + +int comp_items(const struct item_head *stored_ih, const struct treepath *path); +const struct reiserfs_key *get_rkey(const struct treepath *chk_path, +				    const struct super_block *sb); +int search_by_key(struct super_block *, const struct cpu_key *, +		  struct treepath *, int); +#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL) +int search_for_position_by_key(struct super_block *sb, +			       const struct cpu_key *cpu_key, +			       struct treepath *search_path); +extern void decrement_bcount(struct buffer_head *bh); +void decrement_counters_in_path(struct treepath *search_path); +void pathrelse(struct treepath *search_path); +int reiserfs_check_path(struct treepath *p); +void pathrelse_and_restore(struct super_block *s, struct treepath *search_path); + +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, +			 struct treepath *path, +			 const struct cpu_key *key, +			 struct item_head *ih, +			 struct inode *inode, const char *body); + +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, +			     struct treepath *path, +			     const struct cpu_key *key, +			     struct inode *inode, +			     const char *body, int paste_size); + +int 
reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, +			   struct treepath *path, +			   struct cpu_key *key, +			   struct inode *inode, +			   struct page *page, loff_t new_file_size); + +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, +			 struct treepath *path, +			 const struct cpu_key *key, +			 struct inode *inode, struct buffer_head *un_bh); + +void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, +				struct inode *inode, struct reiserfs_key *key); +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, +			   struct inode *inode); +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, +			 struct inode *inode, struct page *, +			 int update_timestamps); + +#define i_block_size(inode) ((inode)->i_sb->s_blocksize) +#define file_size(inode) ((inode)->i_size) +#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1)) + +#define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\ +!STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 ) + +void padd_item(char *item, int total_length, int length); + +/* inode.c */ +/* args for the create parameter of reiserfs_get_block */ +#define GET_BLOCK_NO_CREATE 0	/* don't create new blocks or convert tails */ +#define GET_BLOCK_CREATE 1	/* add anything you need to find block */ +#define GET_BLOCK_NO_HOLE 2	/* return -ENOENT for file holes */ +#define GET_BLOCK_READ_DIRECT 4	/* read the tail if indirect item not found */ +#define GET_BLOCK_NO_IMUX     8	/* i_mutex is not held, don't preallocate */ +#define GET_BLOCK_NO_DANGLE   16	/* don't leave any transactions running */ + +void reiserfs_read_locked_inode(struct inode *inode, +				struct reiserfs_iget_args *args); +int reiserfs_find_actor(struct inode *inode, void *p); +int reiserfs_init_locked_inode(struct inode *inode, void 
*p); +void reiserfs_evict_inode(struct inode *inode); +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc); +int reiserfs_get_block(struct inode *inode, sector_t block, +		       struct buffer_head *bh_result, int create); +struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, +				     int fh_len, int fh_type); +struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, +				     int fh_len, int fh_type); +int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, +		       int connectable); + +int reiserfs_truncate_file(struct inode *, int update_timestamps); +void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset, +		  int type, int key_length); +void make_le_item_head(struct item_head *ih, const struct cpu_key *key, +		       int version, +		       loff_t offset, int type, int length, int entry_count); +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key); + +struct reiserfs_security_handle; +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, +		       struct inode *dir, umode_t mode, +		       const char *symname, loff_t i_size, +		       struct dentry *dentry, struct inode *inode, +		       struct reiserfs_security_handle *security); + +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, +			     struct inode *inode, loff_t size); + +static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, +				      struct inode *inode) +{ +	reiserfs_update_sd_size(th, inode, inode->i_size); +} + +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); + +int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); + +/* namei.c */ +void set_de_name_and_namelen(struct reiserfs_dir_entry *de); +int search_by_entry_key(struct super_block *sb, const 
struct cpu_key *key, +			struct treepath *path, struct reiserfs_dir_entry *de); +struct dentry *reiserfs_get_parent(struct dentry *); + +#ifdef CONFIG_REISERFS_PROC_INFO +int reiserfs_proc_info_init(struct super_block *sb); +int reiserfs_proc_info_done(struct super_block *sb); +int reiserfs_proc_info_global_init(void); +int reiserfs_proc_info_global_done(void); + +#define PROC_EXP( e )   e + +#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data +#define PROC_INFO_MAX( sb, field, value )								\ +    __PINFO( sb ).field =												\ +        max( REISERFS_SB( sb ) -> s_proc_info_data.field, value ) +#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) ) +#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) ) +#define PROC_INFO_BH_STAT( sb, bh, level )							\ +    PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] );						\ +    PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) );	\ +    PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) ) +#else +static inline int reiserfs_proc_info_init(struct super_block *sb) +{ +	return 0; +} + +static inline int reiserfs_proc_info_done(struct super_block *sb) +{ +	return 0; +} + +static inline int reiserfs_proc_info_global_init(void) +{ +	return 0; +} + +static inline int reiserfs_proc_info_global_done(void) +{ +	return 0; +} + +#define PROC_EXP( e ) +#define VOID_V ( ( void ) 0 ) +#define PROC_INFO_MAX( sb, field, value ) VOID_V +#define PROC_INFO_INC( sb, field ) VOID_V +#define PROC_INFO_ADD( sb, field, val ) VOID_V +#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V +#endif + +/* dir.c */ +extern const struct inode_operations reiserfs_dir_inode_operations; +extern const struct inode_operations reiserfs_symlink_inode_operations; +extern const struct inode_operations reiserfs_special_inode_operations; +extern const struct file_operations reiserfs_dir_operations; +int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *); + +/* tail_conversion.c */ +int 
direct2indirect(struct reiserfs_transaction_handle *, struct inode *, +		    struct treepath *, struct buffer_head *, loff_t); +int indirect2direct(struct reiserfs_transaction_handle *, struct inode *, +		    struct page *, struct treepath *, const struct cpu_key *, +		    loff_t, char *); +void reiserfs_unmap_buffer(struct buffer_head *); + +/* file.c */ +extern const struct inode_operations reiserfs_file_inode_operations; +extern const struct file_operations reiserfs_file_operations; +extern const struct address_space_operations reiserfs_address_space_operations; + +/* fix_nodes.c */ + +int fix_nodes(int n_op_mode, struct tree_balance *tb, +	      struct item_head *ins_ih, const void *); +void unfix_nodes(struct tree_balance *); + +/* prints.c */ +void __reiserfs_panic(struct super_block *s, const char *id, +		      const char *function, const char *fmt, ...) +    __attribute__ ((noreturn)); +#define reiserfs_panic(s, id, fmt, args...) \ +	__reiserfs_panic(s, id, __func__, fmt, ##args) +void __reiserfs_error(struct super_block *s, const char *id, +		      const char *function, const char *fmt, ...); +#define reiserfs_error(s, id, fmt, args...) 
\ +	 __reiserfs_error(s, id, __func__, fmt, ##args) +void reiserfs_info(struct super_block *s, const char *fmt, ...); +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...); +void print_indirect_item(struct buffer_head *bh, int item_num); +void store_print_tb(struct tree_balance *tb); +void print_cur_tb(char *mes); +void print_de(struct reiserfs_dir_entry *de); +void print_bi(struct buffer_info *bi, char *mes); +#define PRINT_LEAF_ITEMS 1	/* print all items */ +#define PRINT_DIRECTORY_ITEMS 2	/* print directory items */ +#define PRINT_DIRECT_ITEMS 4	/* print contents of direct items */ +void print_block(struct buffer_head *bh, ...); +void print_bmap(struct super_block *s, int silent); +void print_bmap_block(int i, char *data, int size, int silent); +/*void print_super_block (struct super_block * s, char * mes);*/ +void print_objectid_map(struct super_block *s); +void print_block_head(struct buffer_head *bh, char *mes); +void check_leaf(struct buffer_head *bh); +void check_internal(struct buffer_head *bh); +void print_statistics(struct super_block *s); +char *reiserfs_hashname(int code); + +/* lbalance.c */ +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, +		    int mov_bytes, struct buffer_head *Snew); +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes); +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes); +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, +		       int del_num, int del_bytes); +void leaf_insert_into_buf(struct buffer_info *bi, int before, +			  struct item_head *inserted_item_ih, +			  const char *inserted_item_body, int zeros_number); +void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num, +			  int pos_in_item, int paste_size, const char *body, +			  int zeros_number); +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, +			  int pos_in_item, int cut_size); +void 
leaf_paste_entries(struct buffer_info *bi, int item_num, int before, +			int new_entry_count, struct reiserfs_de_head *new_dehs, +			const char *records, int paste_size); +/* ibalance.c */ +int balance_internal(struct tree_balance *, int, int, struct item_head *, +		     struct buffer_head **); + +/* do_balance.c */ +void do_balance_mark_leaf_dirty(struct tree_balance *tb, +				struct buffer_head *bh, int flag); +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +void do_balance(struct tree_balance *tb, struct item_head *ih, +		const char *body, int flag); +void reiserfs_invalidate_buffer(struct tree_balance *tb, +				struct buffer_head *bh); + +int get_left_neighbor_position(struct tree_balance *tb, int h); +int get_right_neighbor_position(struct tree_balance *tb, int h); +void replace_key(struct tree_balance *tb, struct buffer_head *, int, +		 struct buffer_head *, int); +void make_empty_node(struct buffer_info *); +struct buffer_head *get_FEB(struct tree_balance *); + +/* bitmap.c */ + +/* structure contains hints for block allocator, and it is a container for + * arguments, such as node, search path, transaction_handle, etc. */ +struct __reiserfs_blocknr_hint { +	struct inode *inode;	/* inode passed to allocator, if we allocate unf. 
nodes */ +	sector_t block;		/* file offset, in blocks */ +	struct in_core_key key; +	struct treepath *path;	/* search path, used by allocator to deternine search_start by +				 * various ways */ +	struct reiserfs_transaction_handle *th;	/* transaction handle is needed to log super blocks and +						 * bitmap blocks changes  */ +	b_blocknr_t beg, end; +	b_blocknr_t search_start;	/* a field used to transfer search start value (block number) +					 * between different block allocator procedures +					 * (determine_search_start() and others) */ +	int prealloc_size;	/* is set in determine_prealloc_size() function, used by underlayed +				 * function that do actual allocation */ + +	unsigned formatted_node:1;	/* the allocator uses different polices for getting disk space for +					 * formatted/unformatted blocks with/without preallocation */ +	unsigned preallocate:1; +}; + +typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t; + +int reiserfs_parse_alloc_options(struct super_block *, char *); +void reiserfs_init_alloc_options(struct super_block *s); + +/* + * given a directory, this will tell you what packing locality + * to use for a new object underneat it.  The locality is returned + * in disk byte order (le). 
+ */ +__le32 reiserfs_choose_packing(struct inode *dir); + +int reiserfs_init_bitmap_cache(struct super_block *sb); +void reiserfs_free_bitmap_cache(struct super_block *sb); +void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); +struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap); +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *, +			 b_blocknr_t, int for_unformatted); +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int, +			       int); +static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb, +					     b_blocknr_t * new_blocknrs, +					     int amount_needed) +{ +	reiserfs_blocknr_hint_t hint = { +		.th = tb->transaction_handle, +		.path = tb->tb_path, +		.inode = NULL, +		.key = tb->key, +		.block = 0, +		.formatted_node = 1 +	}; +	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed, +					  0); +} + +static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle +					    *th, struct inode *inode, +					    b_blocknr_t * new_blocknrs, +					    struct treepath *path, +					    sector_t block) +{ +	reiserfs_blocknr_hint_t hint = { +		.th = th, +		.path = path, +		.inode = inode, +		.block = block, +		.formatted_node = 0, +		.preallocate = 0 +	}; +	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); +} + +#ifdef REISERFS_PREALLOCATE +static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle +					     *th, struct inode *inode, +					     b_blocknr_t * new_blocknrs, +					     struct treepath *path, +					     sector_t block) +{ +	reiserfs_blocknr_hint_t hint = { +		.th = th, +		.path = path, +		.inode = inode, +		.block = block, +		.formatted_node = 0, +		.preallocate = 1 +	}; +	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0); +} + +void 
reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, +			       struct inode *inode); +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th); +#endif + +/* hashes.c */ +__u32 keyed_hash(const signed char *msg, int len); +__u32 yura_hash(const signed char *msg, int len); +__u32 r5_hash(const signed char *msg, int len); + +#define reiserfs_set_le_bit		__set_bit_le +#define reiserfs_test_and_set_le_bit	__test_and_set_bit_le +#define reiserfs_clear_le_bit		__clear_bit_le +#define reiserfs_test_and_clear_le_bit	__test_and_clear_bit_le +#define reiserfs_test_le_bit		test_bit_le +#define reiserfs_find_next_zero_le_bit	find_next_zero_bit_le + +/* sometimes reiserfs_truncate may require to allocate few new blocks +   to perform indirect2direct conversion. People probably used to +   think, that truncate should work without problems on a filesystem +   without free disk space. They may complain that they can not +   truncate due to lack of free disk space. This spare space allows us +   to not worry about it. 
500 is probably too much, but it should be +   absolutely safe */ +#define SPARE_SPACE 500 + +/* prototypes from ioctl.c */ +long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long reiserfs_compat_ioctl(struct file *filp, +		   unsigned int cmd, unsigned long arg); +int reiserfs_unpack(struct inode *inode, struct file *filp); diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 7483279b482..9a17f63c3fd 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -13,8 +13,7 @@  #include <linux/vmalloc.h>  #include <linux/string.h>  #include <linux/errno.h> -#include <linux/reiserfs_fs.h> -#include <linux/reiserfs_fs_sb.h> +#include "reiserfs.h"  #include <linux/buffer_head.h>  int reiserfs_resize(struct super_block *s, unsigned long block_count_new) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 313d39d639e..f8afa4b162b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -51,7 +51,7 @@  #include <linux/time.h>  #include <linux/string.h>  #include <linux/pagemap.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/buffer_head.h>  #include <linux/quotaops.h> @@ -1284,12 +1284,12 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,  		 ** -clm  		 */ -		data = kmap_atomic(un_bh->b_page, KM_USER0); +		data = kmap_atomic(un_bh->b_page);  		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));  		memcpy(data + off,  		       B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih),  		       ret_value); -		kunmap_atomic(data, KM_USER0); +		kunmap_atomic(data);  	}  	/* Perform balancing after all resources have been collected at once. 
*/  	do_balance(&s_del_balance, NULL, NULL, M_DELETE); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e12d8b97cd4..8b7616ef06d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -16,9 +16,9 @@  #include <linux/vmalloc.h>  #include <linux/time.h>  #include <asm/uaccess.h> -#include <linux/reiserfs_fs.h> -#include <linux/reiserfs_acl.h> -#include <linux/reiserfs_xattr.h> +#include "reiserfs.h" +#include "acl.h" +#include "xattr.h"  #include <linux/init.h>  #include <linux/blkdev.h>  #include <linux/buffer_head.h> @@ -1874,11 +1874,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)  		unlock_new_inode(root_inode);  	} -	s->s_root = d_alloc_root(root_inode); -	if (!s->s_root) { -		iput(root_inode); +	s->s_root = d_make_root(root_inode); +	if (!s->s_root)  		goto error; -	}  	// define and initialize hash function  	sbi->s_hash_function = hash_function(s);  	if (sbi->s_hash_function == NULL) { diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index d7f6e51bef2..5e2624d12f7 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -5,7 +5,7 @@  #include <linux/time.h>  #include <linux/pagemap.h>  #include <linux/buffer_head.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  /* access to tail : when one is going to read tail it must make sure, that is not running.   
direct2indirect and indirect2direct can not run concurrently */ @@ -128,9 +128,9 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,  	if (up_to_date_bh) {  		unsigned pgoff =  		    (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); -		char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0); +		char *kaddr = kmap_atomic(up_to_date_bh->b_page);  		memset(kaddr + pgoff, 0, blk_size - total_tail); -		kunmap_atomic(kaddr, KM_USER0); +		kunmap_atomic(kaddr);  	}  	REISERFS_I(inode)->i_first_direct_byte = U32_MAX; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c24deda8a8b..46fc1c20a6b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -33,7 +33,7 @@   * The xattrs themselves are protected by the xattr_sem.   */ -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/capability.h>  #include <linux/dcache.h>  #include <linux/namei.h> @@ -43,8 +43,8 @@  #include <linux/file.h>  #include <linux/pagemap.h>  #include <linux/xattr.h> -#include <linux/reiserfs_xattr.h> -#include <linux/reiserfs_acl.h> +#include "xattr.h" +#include "acl.h"  #include <asm/uaccess.h>  #include <net/checksum.h>  #include <linux/stat.h> diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h new file mode 100644 index 00000000000..f59626c5d33 --- /dev/null +++ b/fs/reiserfs/xattr.h @@ -0,0 +1,122 @@ +#include <linux/reiserfs_xattr.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/rwsem.h> + +struct inode; +struct dentry; +struct iattr; +struct super_block; +struct nameidata; + +int reiserfs_xattr_register_handlers(void) __init; +void reiserfs_xattr_unregister_handlers(void); +int reiserfs_xattr_init(struct super_block *sb, int mount_flags); +int reiserfs_lookup_privroot(struct super_block *sb); +int reiserfs_delete_xattrs(struct inode *inode); +int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); +int reiserfs_permission(struct inode *inode, int mask); + +#ifdef 
CONFIG_REISERFS_FS_XATTR +#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) +ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, +			  void *buffer, size_t size); +int reiserfs_setxattr(struct dentry *dentry, const char *name, +		      const void *value, size_t size, int flags); +ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int reiserfs_removexattr(struct dentry *dentry, const char *name); + +int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); +int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); +int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *, +			      struct inode *, const char *, const void *, +			      size_t, int); + +extern const struct xattr_handler reiserfs_xattr_user_handler; +extern const struct xattr_handler reiserfs_xattr_trusted_handler; +extern const struct xattr_handler reiserfs_xattr_security_handler; +#ifdef CONFIG_REISERFS_FS_SECURITY +int reiserfs_security_init(struct inode *dir, struct inode *inode, +			   const struct qstr *qstr, +			   struct reiserfs_security_handle *sec); +int reiserfs_security_write(struct reiserfs_transaction_handle *th, +			    struct inode *inode, +			    struct reiserfs_security_handle *sec); +void reiserfs_security_free(struct reiserfs_security_handle *sec); +#endif + +static inline int reiserfs_xattrs_initialized(struct super_block *sb) +{ +	return REISERFS_SB(sb)->priv_root != NULL; +} + +#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) +static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) +{ +	loff_t ret = 0; +	if (reiserfs_file_data_log(inode)) { +		ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize); +		ret >>= inode->i_sb->s_blocksize_bits; +	} +	return ret; +} + +/* We may have to create up to 3 objects: xattr root, xattr dir, xattr file. + * Let's try to be smart about it. + * xattr root: We cache it. 
If it's not cached, we may need to create it. + * xattr dir: If anything has been loaded for this inode, we can set a flag + *            saying so. + * xattr file: Since we don't cache xattrs, we can't tell. We always include + *             blocks for it. + * + * However, since root and dir can be created between calls - YOU MUST SAVE + * THIS VALUE. + */ +static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) +{ +	size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); + +	if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { +		nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); +		if (!REISERFS_SB(inode->i_sb)->xattr_root->d_inode) +			nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); +	} + +	return nblocks; +} + +static inline void reiserfs_init_xattr_rwsem(struct inode *inode) +{ +	init_rwsem(&REISERFS_I(inode)->i_xattr_sem); +} + +#else + +#define reiserfs_getxattr NULL +#define reiserfs_setxattr NULL +#define reiserfs_listxattr NULL +#define reiserfs_removexattr NULL + +static inline void reiserfs_init_xattr_rwsem(struct inode *inode) +{ +} +#endif  /*  CONFIG_REISERFS_FS_XATTR  */ + +#ifndef CONFIG_REISERFS_FS_SECURITY +static inline int reiserfs_security_init(struct inode *dir, +					 struct inode *inode, +					 const struct qstr *qstr, +					 struct reiserfs_security_handle *sec) +{ +	return 0; +} +static inline int +reiserfs_security_write(struct reiserfs_transaction_handle *th, +			struct inode *inode, +			struct reiserfs_security_handle *sec) +{ +	return 0; +} +static inline void reiserfs_security_free(struct reiserfs_security_handle *sec) +{} +#endif diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 6da0396e505..44474f9b990 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -1,14 +1,14 @@  #include <linux/capability.h>  #include <linux/fs.h>  #include <linux/posix_acl.h> -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/errno.h>  #include <linux/pagemap.h>  #include 
<linux/xattr.h>  #include <linux/slab.h>  #include <linux/posix_acl_xattr.h> -#include <linux/reiserfs_xattr.h> -#include <linux/reiserfs_acl.h> +#include "xattr.h" +#include "acl.h"  #include <asm/uaccess.h>  static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 534668fa41b..800a3cef6f6 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -1,10 +1,10 @@ -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/errno.h>  #include <linux/fs.h>  #include <linux/pagemap.h>  #include <linux/xattr.h>  #include <linux/slab.h> -#include <linux/reiserfs_xattr.h> +#include "xattr.h"  #include <linux/security.h>  #include <asm/uaccess.h> diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 9883736ce3e..a0035719f66 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -1,10 +1,10 @@ -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/capability.h>  #include <linux/errno.h>  #include <linux/fs.h>  #include <linux/pagemap.h>  #include <linux/xattr.h> -#include <linux/reiserfs_xattr.h> +#include "xattr.h"  #include <asm/uaccess.h>  static int diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 45ae1a00013..8667491ae7c 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -1,9 +1,9 @@ -#include <linux/reiserfs_fs.h> +#include "reiserfs.h"  #include <linux/errno.h>  #include <linux/fs.h>  #include <linux/pagemap.h>  #include <linux/xattr.h> -#include <linux/reiserfs_xattr.h> +#include "xattr.h"  #include <asm/uaccess.h>  static int diff --git a/fs/romfs/super.c b/fs/romfs/super.c index bb36ab74eb4..e64f6b5f7ae 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -538,14 +538,12 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)  	if (IS_ERR(root))  		goto error; -	sb->s_root = d_alloc_root(root); +	sb->s_root = 
d_make_root(root);  	if (!sb->s_root) -		goto error_i; +		goto error;  	return 0; -error_i: -	iput(root);  error:  	return -EINVAL;  error_rsb_inval: diff --git a/fs/select.c b/fs/select.c index d33418fdc85..6fb8943d580 100644 --- a/fs/select.c +++ b/fs/select.c @@ -17,7 +17,7 @@  #include <linux/kernel.h>  #include <linux/sched.h>  #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/slab.h>  #include <linux/poll.h>  #include <linux/personality.h> /* for STICKY_TIMEOUTS */ @@ -223,7 +223,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,  	get_file(filp);  	entry->filp = filp;  	entry->wait_address = wait_address; -	entry->key = p->key; +	entry->key = p->_key;  	init_waitqueue_func_entry(&entry->wait, pollwake);  	entry->wait.private = pwq;  	add_wait_queue(wait_address, &entry->wait); @@ -386,13 +386,11 @@ get_max:  static inline void wait_key_set(poll_table *wait, unsigned long in,  				unsigned long out, unsigned long bit)  { -	if (wait) { -		wait->key = POLLEX_SET; -		if (in & bit) -			wait->key |= POLLIN_SET; -		if (out & bit) -			wait->key |= POLLOUT_SET; -	} +	wait->_key = POLLEX_SET; +	if (in & bit) +		wait->_key |= POLLIN_SET; +	if (out & bit) +		wait->_key |= POLLOUT_SET;  }  int do_select(int n, fd_set_bits *fds, struct timespec *end_time) @@ -414,7 +412,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)  	poll_initwait(&table);  	wait = &table.pt;  	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { -		wait = NULL; +		wait->_qproc = NULL;  		timed_out = 1;  	} @@ -459,17 +457,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)  					if ((mask & POLLIN_SET) && (in & bit)) {  						res_in |= bit;  						retval++; -						wait = NULL; +						wait->_qproc = NULL;  					}  					if ((mask & POLLOUT_SET) && (out & bit)) {  						res_out |= bit;  						retval++; -						wait = NULL; +						wait->_qproc = NULL;  					}  					if ((mask & 
POLLEX_SET) && (ex & bit)) {  						res_ex |= bit;  						retval++; -						wait = NULL; +						wait->_qproc = NULL;  					}  				}  			} @@ -481,7 +479,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)  				*rexp = res_ex;  			cond_resched();  		} -		wait = NULL; +		wait->_qproc = NULL;  		if (retval || timed_out || signal_pending(current))  			break;  		if (table.error) { @@ -720,7 +718,7 @@ struct poll_list {   * interested in events matching the pollfd->events mask, and the result   * matching that mask is both recorded in pollfd->revents and returned. The   * pwait poll_table will be used by the fd-provided poll handler for waiting, - * if non-NULL. + * if pwait->_qproc is non-NULL.   */  static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)  { @@ -738,9 +736,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)  		if (file != NULL) {  			mask = DEFAULT_POLLMASK;  			if (file->f_op && file->f_op->poll) { -				if (pwait) -					pwait->key = pollfd->events | -							POLLERR | POLLHUP; +				pwait->_key = pollfd->events|POLLERR|POLLHUP;  				mask = file->f_op->poll(file, pwait);  			}  			/* Mask out unneeded events. */ @@ -763,7 +759,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,  	/* Optimise the no-wait case */  	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { -		pt = NULL; +		pt->_qproc = NULL;  		timed_out = 1;  	} @@ -781,22 +777,22 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,  			for (; pfd != pfd_end; pfd++) {  				/*  				 * Fish for events. If we found one, record it -				 * and kill the poll_table, so we don't +				 * and kill poll_table->_qproc, so we don't  				 * needlessly register any other waiters after  				 * this. They'll get immediately deregistered  				 * when we break out and return.  				 
*/  				if (do_pollfd(pfd, pt)) {  					count++; -					pt = NULL; +					pt->_qproc = NULL;  				}  			}  		}  		/*  		 * All waiters have already been registered, so don't provide -		 * a poll_table to them on the next loop iteration. +		 * a poll_table->_qproc to them on the next loop iteration.  		 */ -		pt = NULL; +		pt->_qproc = NULL;  		if (!count) {  			count = wait->error;  			if (signal_pending(current)) @@ -912,7 +908,7 @@ static long do_restart_poll(struct restart_block *restart_block)  }  SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, -		long, timeout_msecs) +		int, timeout_msecs)  {  	struct timespec end_time, *to = NULL;  	int ret; diff --git a/fs/seq_file.c b/fs/seq_file.c index 4023d6be939..0cbd0494b79 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -6,13 +6,29 @@   */  #include <linux/fs.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/seq_file.h>  #include <linux/slab.h>  #include <asm/uaccess.h>  #include <asm/page.h> + +/* + * seq_files have a buffer which can may overflow. When this happens a larger + * buffer is reallocated and all the data will be printed again. + * The overflow state is true when m->count == m->size. 
+ */ +static bool seq_overflow(struct seq_file *m) +{ +	return m->count == m->size; +} + +static void seq_set_overflow(struct seq_file *m) +{ +	m->count = m->size; +} +  /**   *	seq_open -	initialize sequential file   *	@file: file we initialize @@ -92,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset)  			error = 0;  			m->count = 0;  		} -		if (m->count == m->size) +		if (seq_overflow(m))  			goto Eoverflow;  		if (pos + m->count > offset) {  			m->from = offset - pos; @@ -140,9 +156,21 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  	mutex_lock(&m->lock); +	/* +	 * seq_file->op->..m_start/m_stop/m_next may do special actions +	 * or optimisations based on the file->f_version, so we want to +	 * pass the file->f_version to those methods. +	 * +	 * seq_file->version is just copy of f_version, and seq_file +	 * methods can treat it simply as file version. +	 * It is copied in first and copied out after all operations. +	 * It is convenient to have it as  part of structure to avoid the +	 * need of passing another argument to all the seq_file methods. +	 */ +	m->version = file->f_version; +  	/* Don't assume *ppos is where we left it */  	if (unlikely(*ppos != m->read_pos)) { -		m->read_pos = *ppos;  		while ((err = traverse(m, *ppos)) == -EAGAIN)  			;  		if (err) { @@ -152,21 +180,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  			m->index = 0;  			m->count = 0;  			goto Done; +		} else { +			m->read_pos = *ppos;  		}  	} -	/* -	 * seq_file->op->..m_start/m_stop/m_next may do special actions -	 * or optimisations based on the file->f_version, so we want to -	 * pass the file->f_version to those methods. -	 * -	 * seq_file->version is just copy of f_version, and seq_file -	 * methods can treat it simply as file version. -	 * It is copied in first and copied out after all operations. 
-	 * It is convenient to have it as  part of structure to avoid the -	 * need of passing another argument to all the seq_file methods. -	 */ -	m->version = file->f_version;  	/* grab buffer if we didn't have one */  	if (!m->buf) {  		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); @@ -232,7 +250,7 @@ Fill:  			break;  		}  		err = m->op->show(m, p); -		if (m->count == m->size || err) { +		if (seq_overflow(m) || err) {  			m->count = offs;  			if (likely(err <= 0))  				break; @@ -359,7 +377,7 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)  			*p++ = '0' + (c & 07);  			continue;  		} -		m->count = m->size; +		seq_set_overflow(m);  		return -1;          }  	m->count = p - m->buf; @@ -381,7 +399,7 @@ int seq_printf(struct seq_file *m, const char *f, ...)  			return 0;  		}  	} -	m->count = m->size; +	seq_set_overflow(m);  	return -1;  }  EXPORT_SYMBOL(seq_printf); @@ -510,7 +528,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,  			return 0;  		}  	} -	m->count = m->size; +	seq_set_overflow(m);  	return -1;  }  EXPORT_SYMBOL(seq_bitmap); @@ -526,7 +544,7 @@ int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,  			return 0;  		}  	} -	m->count = m->size; +	seq_set_overflow(m);  	return -1;  }  EXPORT_SYMBOL(seq_bitmap_list); @@ -637,11 +655,63 @@ int seq_puts(struct seq_file *m, const char *s)  		m->count += len;  		return 0;  	} -	m->count = m->size; +	seq_set_overflow(m);  	return -1;  }  EXPORT_SYMBOL(seq_puts); +/* + * A helper routine for putting decimal numbers without rich format of printf(). + * only 'unsigned long long' is supported. + * This routine will put one byte delimiter + number into seq_file. + * This routine is very quick when you show lots of numbers. + * In usual cases, it will be better to use seq_printf(). It's easier to read. 
+ */ +int seq_put_decimal_ull(struct seq_file *m, char delimiter, +			unsigned long long num) +{ +	int len; + +	if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ +		goto overflow; + +	if (delimiter) +		m->buf[m->count++] = delimiter; + +	if (num < 10) { +		m->buf[m->count++] = num + '0'; +		return 0; +	} + +	len = num_to_str(m->buf + m->count, m->size - m->count, num); +	if (!len) +		goto overflow; +	m->count += len; +	return 0; +overflow: +	seq_set_overflow(m); +	return -1; +} +EXPORT_SYMBOL(seq_put_decimal_ull); + +int seq_put_decimal_ll(struct seq_file *m, char delimiter, +			long long num) +{ +	if (num < 0) { +		if (m->count + 3 >= m->size) { +			seq_set_overflow(m); +			return -1; +		} +		if (delimiter) +			m->buf[m->count++] = delimiter; +		num = -num; +		delimiter = '-'; +	} +	return seq_put_decimal_ull(m, delimiter, num); + +} +EXPORT_SYMBOL(seq_put_decimal_ll); +  /**   * seq_write - write arbitrary data to buffer   * @seq: seq_file identifying the buffer to which data should be written @@ -657,7 +727,7 @@ int seq_write(struct seq_file *seq, const void *data, size_t len)  		seq->count += len;  		return 0;  	} -	seq->count = seq->size; +	seq_set_overflow(seq);  	return -1;  }  EXPORT_SYMBOL(seq_write); diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451d..7ae2a574cb2 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,21 @@  #include <linux/signalfd.h>  #include <linux/syscalls.h> +void signalfd_cleanup(struct sighand_struct *sighand) +{ +	wait_queue_head_t *wqh = &sighand->signalfd_wqh; +	/* +	 * The lockless check can race with remove_wait_queue() in progress, +	 * but in this case its caller should run under rcu_read_lock() and +	 * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. 
+	 */ +	if (likely(!waitqueue_active(wqh))) +		return; + +	/* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ +	wake_up_poll(wqh, POLLHUP | POLLFREE); +} +  struct signalfd_ctx {  	sigset_t sigmask;  }; diff --git a/fs/splice.c b/fs/splice.c index 1ec0493266b..5f883de7ef3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -25,7 +25,7 @@  #include <linux/mm_inline.h>  #include <linux/swap.h>  #include <linux/writeback.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/syscalls.h>  #include <linux/uio.h>  #include <linux/security.h> @@ -737,15 +737,12 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,  		goto out;  	if (buf->page != page) { -		/* -		 * Careful, ->map() uses KM_USER0! -		 */  		char *src = buf->ops->map(pipe, buf, 1); -		char *dst = kmap_atomic(page, KM_USER1); +		char *dst = kmap_atomic(page);  		memcpy(dst + offset, src + buf->offset, this_len);  		flush_dcache_page(page); -		kunmap_atomic(dst, KM_USER1); +		kunmap_atomic(dst);  		buf->ops->unmap(pipe, buf, src);  	}  	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 38bb1c64055..8ca62c28fe1 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -464,10 +464,10 @@ static int squashfs_readpage(struct file *file, struct page *page)  		if (PageUptodate(push_page))  			goto skip_page; -		pageaddr = kmap_atomic(push_page, KM_USER0); +		pageaddr = kmap_atomic(push_page);  		squashfs_copy_data(pageaddr, buffer, offset, avail);  		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); -		kunmap_atomic(pageaddr, KM_USER0); +		kunmap_atomic(pageaddr);  		flush_dcache_page(push_page);  		SetPageUptodate(push_page);  skip_page: @@ -484,9 +484,9 @@ skip_page:  error_out:  	SetPageError(page);  out: -	pageaddr = kmap_atomic(page, KM_USER0); +	pageaddr = kmap_atomic(page);  	memset(pageaddr, 0, PAGE_CACHE_SIZE); -	kunmap_atomic(pageaddr, KM_USER0); +	kunmap_atomic(pageaddr);  	
flush_dcache_page(page);  	if (!PageError(page))  		SetPageUptodate(page); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index ecaa2f7bdb8..970b1167e7c 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -316,11 +316,10 @@ check_directory_table:  	}  	insert_inode_hash(root); -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (sb->s_root == NULL) {  		ERROR("Root inode create failed\n");  		err = -ENOMEM; -		iput(root);  		goto failed_mount;  	} diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 1191817264c..12806dffb34 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -90,14 +90,14 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)  			goto error_out;  		} -		pageaddr = kmap_atomic(page, KM_USER0); +		pageaddr = kmap_atomic(page);  		copied = squashfs_copy_data(pageaddr + bytes, entry, offset,  								length - bytes);  		if (copied == length - bytes)  			memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);  		else  			block = entry->next_index; -		kunmap_atomic(pageaddr, KM_USER0); +		kunmap_atomic(pageaddr);  		squashfs_cache_put(entry);  	} diff --git a/fs/stack.c b/fs/stack.c index 9c11519245a..5b5388250e2 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/fs.h>  #include <linux/fs_stack.h> diff --git a/fs/stat.c b/fs/stat.c index 8806b8997d2..c733dc5753a 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -4,7 +4,7 @@   *  Copyright (C) 1991, 1992  Linus Torvalds   */ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/mm.h>  #include <linux/errno.h>  #include <linux/file.h> @@ -307,7 +307,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,  		if (inode->i_op->readlink) {  			error = security_inode_readlink(path.dentry);  			if (!error) { -				touch_atime(path.mnt, path.dentry); +				touch_atime(&path);  				error = inode->i_op->readlink(path.dentry,  		
					      buf, bufsiz);  			} diff --git a/fs/statfs.c b/fs/statfs.c index 2aa6a22e0be..43e6b6fe4e8 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -1,5 +1,5 @@  #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/fs.h>  #include <linux/file.h>  #include <linux/mount.h> diff --git a/fs/super.c b/fs/super.c index 6015c02296b..cf001775617 100644 --- a/fs/super.c +++ b/fs/super.c @@ -20,7 +20,7 @@   *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000   */ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/slab.h>  #include <linux/acct.h>  #include <linux/blkdev.h> @@ -32,6 +32,7 @@  #include <linux/backing-dev.h>  #include <linux/rculist_bl.h>  #include <linux/cleancache.h> +#include <linux/fsnotify.h>  #include "internal.h" @@ -250,7 +251,7 @@ void deactivate_locked_super(struct super_block *s)  {  	struct file_system_type *fs = s->s_type;  	if (atomic_dec_and_test(&s->s_active)) { -		cleancache_flush_fs(s); +		cleancache_invalidate_fs(s);  		fs->kill_sb(s);  		/* caches are now gone, we can safely kill the shrinker now */ @@ -634,6 +635,28 @@ rescan:  EXPORT_SYMBOL(get_super);  /** + *	get_super_thawed - get thawed superblock of a device + *	@bdev: device to get the superblock for + * + *	Scans the superblock list and finds the superblock of the file system + *	mounted on the device. The superblock is returned once it is thawed + *	(or immediately if it was not frozen). %NULL is returned if no match + *	is found. 
+ */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ +	while (1) { +		struct super_block *s = get_super(bdev); +		if (!s || s->s_frozen == SB_UNFROZEN) +			return s; +		up_read(&s->s_umount); +		vfs_check_frozen(s, SB_FREEZE_WRITE); +		put_super(s); +	} +} +EXPORT_SYMBOL(get_super_thawed); + +/**   * get_active_super - get an active reference to the superblock of a device   * @bdev: device to get the superblock for   * diff --git a/fs/sync.c b/fs/sync.c index f3501ef3923..0e8db939d96 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -6,7 +6,7 @@  #include <linux/file.h>  #include <linux/fs.h>  #include <linux/slab.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/namei.h>  #include <linux/sched.h>  #include <linux/writeback.h> diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 7fdf6a7b743..2a7a3f5d1ca 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -22,76 +22,103 @@  #include <linux/mutex.h>  #include <linux/slab.h>  #include <linux/security.h> +#include <linux/hash.h>  #include "sysfs.h"  DEFINE_MUTEX(sysfs_mutex);  DEFINE_SPINLOCK(sysfs_assoc_lock); +#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb); +  static DEFINE_SPINLOCK(sysfs_ino_lock);  static DEFINE_IDA(sysfs_ino_ida);  /** - *	sysfs_link_sibling - link sysfs_dirent into sibling list + *	sysfs_name_hash + *	@ns:   Namespace tag to hash + *	@name: Null terminated string to hash + * + *	Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int sysfs_name_hash(const void *ns, const char *name) +{ +	unsigned long hash = init_name_hash(); +	unsigned int len = strlen(name); +	while (len--) +		hash = partial_name_hash(*name++, hash); +	hash = ( end_name_hash(hash) ^ hash_ptr( (void *)ns, 31 ) ); +	hash &= 0x7fffffffU; +	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ +	if (hash < 1) +		hash += 2; +	if (hash >= INT_MAX) +		hash = INT_MAX - 1; +	return hash; +} + +static int sysfs_name_compare(unsigned int 
hash, const void *ns, +	const char *name, const struct sysfs_dirent *sd) +{ +	if (hash != sd->s_hash) +		return hash - sd->s_hash; +	if (ns != sd->s_ns) +		return ns - sd->s_ns; +	return strcmp(name, sd->s_name); +} + +static int sysfs_sd_compare(const struct sysfs_dirent *left, +			    const struct sysfs_dirent *right) +{ +	return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name, +				  right); +} + +/** + *	sysfs_link_subling - link sysfs_dirent into sibling rbtree   *	@sd: sysfs_dirent of interest   * - *	Link @sd into its sibling list which starts from + *	Link @sd into its sibling rbtree which starts from   *	sd->s_parent->s_dir.children.   *   *	Locking:   *	mutex_lock(sysfs_mutex) + * + *	RETURNS: + *	0 on susccess -EEXIST on failure.   */ -static void sysfs_link_sibling(struct sysfs_dirent *sd) +static int sysfs_link_sibling(struct sysfs_dirent *sd)  { -	struct sysfs_dirent *parent_sd = sd->s_parent; - -	struct rb_node **p; -	struct rb_node *parent; +	struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; +	struct rb_node *parent = NULL;  	if (sysfs_type(sd) == SYSFS_DIR) -		parent_sd->s_dir.subdirs++; +		sd->s_parent->s_dir.subdirs++; -	p = &parent_sd->s_dir.inode_tree.rb_node; -	parent = NULL; -	while (*p) { -		parent = *p; -#define node	rb_entry(parent, struct sysfs_dirent, inode_node) -		if (sd->s_ino < node->s_ino) { -			p = &node->inode_node.rb_left; -		} else if (sd->s_ino > node->s_ino) { -			p = &node->inode_node.rb_right; -		} else { -			printk(KERN_CRIT "sysfs: inserting duplicate inode '%lx'\n", -			       (unsigned long) sd->s_ino); -			BUG(); -		} -#undef node -	} -	rb_link_node(&sd->inode_node, parent, p); -	rb_insert_color(&sd->inode_node, &parent_sd->s_dir.inode_tree); +	while (*node) { +		struct sysfs_dirent *pos; +		int result; -	p = &parent_sd->s_dir.name_tree.rb_node; -	parent = NULL; -	while (*p) { -		int c; -		parent = *p; -#define node	rb_entry(parent, struct sysfs_dirent, name_node) -		c = strcmp(sd->s_name, 
node->s_name); -		if (c < 0) { -			p = &node->name_node.rb_left; -		} else { -			p = &node->name_node.rb_right; -		} -#undef node +		pos = to_sysfs_dirent(*node); +		parent = *node; +		result = sysfs_sd_compare(sd, pos); +		if (result < 0) +			node = &pos->s_rb.rb_left; +		else if (result > 0) +			node = &pos->s_rb.rb_right; +		else +			return -EEXIST;  	} -	rb_link_node(&sd->name_node, parent, p); -	rb_insert_color(&sd->name_node, &parent_sd->s_dir.name_tree); +	/* add new node and rebalance the tree */ +	rb_link_node(&sd->s_rb, parent, node); +	rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); +	return 0;  }  /** - *	sysfs_unlink_sibling - unlink sysfs_dirent from sibling list + *	sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree   *	@sd: sysfs_dirent of interest   * - *	Unlink @sd from its sibling list which starts from + *	Unlink @sd from its sibling rbtree which starts from   *	sd->s_parent->s_dir.children.   *   *	Locking: @@ -102,8 +129,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)  	if (sysfs_type(sd) == SYSFS_DIR)  		sd->s_parent->s_dir.subdirs--; -	rb_erase(&sd->inode_node, &sd->s_parent->s_dir.inode_tree); -	rb_erase(&sd->name_node, &sd->s_parent->s_dir.name_tree); +	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);  }  /** @@ -198,7 +224,7 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)  	rwsem_release(&sd->dep_map, 1, _RET_IP_);  } -static int sysfs_alloc_ino(ino_t *pino) +static int sysfs_alloc_ino(unsigned int *pino)  {  	int ino, rc; @@ -217,7 +243,7 @@ static int sysfs_alloc_ino(ino_t *pino)  	return rc;  } -static void sysfs_free_ino(ino_t ino) +static void sysfs_free_ino(unsigned int ino)  {  	spin_lock(&sysfs_ino_lock);  	ida_remove(&sysfs_ino_ida, ino); @@ -402,6 +428,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,  int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)  {  	struct sysfs_inode_attrs *ps_iattr; +	int ret;  	if (!!sysfs_ns_type(acxt->parent_sd) != 
!!sd->s_ns) {  		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", @@ -410,12 +437,12 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)  		return -EINVAL;  	} -	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name)) -		return -EEXIST; - +	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);  	sd->s_parent = sysfs_get(acxt->parent_sd); -	sysfs_link_sibling(sd); +	ret = sysfs_link_sibling(sd); +	if (ret) +		return ret;  	/* Update timestamps on the parent */  	ps_iattr = acxt->parent_sd->s_iattr; @@ -565,8 +592,8 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,  				       const void *ns,  				       const unsigned char *name)  { -	struct rb_node *p = parent_sd->s_dir.name_tree.rb_node; -	struct sysfs_dirent *found = NULL; +	struct rb_node *node = parent_sd->s_dir.children.rb_node; +	unsigned int hash;  	if (!!sysfs_ns_type(parent_sd) != !!ns) {  		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", @@ -575,33 +602,21 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,  		return NULL;  	} -	while (p) { -		int c; -#define node	rb_entry(p, struct sysfs_dirent, name_node) -		c = strcmp(name, node->s_name); -		if (c < 0) { -			p = node->name_node.rb_left; -		} else if (c > 0) { -			p = node->name_node.rb_right; -		} else { -			found = node; -			p = node->name_node.rb_left; -		} -#undef node -	} +	hash = sysfs_name_hash(ns, name); +	while (node) { +		struct sysfs_dirent *sd; +		int result; -	if (found) { -		while (found->s_ns != ns) { -			p = rb_next(&found->name_node); -			if (!p) -				return NULL; -			found = rb_entry(p, struct sysfs_dirent, name_node); -			if (strcmp(name, found->s_name)) -				return NULL; -		} +		sd = to_sysfs_dirent(node); +		result = sysfs_name_compare(hash, ns, name, sd); +		if (result < 0) +			node = node->rb_left; +		else if (result > 0) +			node = node->rb_right; +		else +			return sd;  	} - -	return found; +	return NULL;  }  /** @@ -804,9 +819,9 @@ static 
void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)  	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);  	sysfs_addrm_start(&acxt, dir_sd); -	pos = rb_first(&dir_sd->s_dir.inode_tree); +	pos = rb_first(&dir_sd->s_dir.children);  	while (pos) { -		struct sysfs_dirent *sd = rb_entry(pos, struct sysfs_dirent, inode_node); +		struct sysfs_dirent *sd = to_sysfs_dirent(pos);  		pos = rb_next(pos);  		if (sysfs_type(sd) != SYSFS_DIR)  			sysfs_remove_one(&acxt, sd); @@ -863,6 +878,7 @@ int sysfs_rename(struct sysfs_dirent *sd,  		dup_name = sd->s_name;  		sd->s_name = new_name; +		sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);  	}  	/* Move to the appropriate place in the appropriate directories rbtree. */ @@ -919,38 +935,36 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)  }  static struct sysfs_dirent *sysfs_dir_pos(const void *ns, -	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos) +	struct sysfs_dirent *parent_sd,	loff_t hash, struct sysfs_dirent *pos)  {  	if (pos) {  		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&  			pos->s_parent == parent_sd && -			ino == pos->s_ino; +			hash == pos->s_hash;  		sysfs_put(pos);  		if (!valid)  			pos = NULL;  	} -	if (!pos && (ino > 1) && (ino < INT_MAX)) { -		struct rb_node *p = parent_sd->s_dir.inode_tree.rb_node; -		while (p) { -#define node	rb_entry(p, struct sysfs_dirent, inode_node) -			if (ino < node->s_ino) { -				pos = node; -				p = node->inode_node.rb_left; -			} else if (ino > node->s_ino) { -				p = node->inode_node.rb_right; -			} else { -				pos = node; +	if (!pos && (hash > 1) && (hash < INT_MAX)) { +		struct rb_node *node = parent_sd->s_dir.children.rb_node; +		while (node) { +			pos = to_sysfs_dirent(node); + +			if (hash < pos->s_hash) +				node = node->rb_left; +			else if (hash > pos->s_hash) +				node = node->rb_right; +			else  				break; -			} -#undef node  		}  	} +	/* Skip over entries in the wrong namespace */  	while (pos && pos->s_ns != ns) { -		
struct rb_node *p = rb_next(&pos->inode_node); -		if (!p) +		struct rb_node *node = rb_next(&pos->s_rb); +		if (!node)  			pos = NULL;  		else -			pos = rb_entry(p, struct sysfs_dirent, inode_node); +			pos = to_sysfs_dirent(node);  	}  	return pos;  } @@ -960,11 +974,11 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,  {  	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);  	if (pos) do { -		struct rb_node *p = rb_next(&pos->inode_node); -		if (!p) +		struct rb_node *node = rb_next(&pos->s_rb); +		if (!node)  			pos = NULL;  		else -			pos = rb_entry(p, struct sysfs_dirent, inode_node); +			pos = to_sysfs_dirent(node);  	} while (pos && pos->s_ns != ns);  	return pos;  } @@ -1006,7 +1020,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)  		len = strlen(name);  		ino = pos->s_ino;  		type = dt_type(pos); -		filp->f_pos = ino; +		filp->f_pos = pos->s_hash;  		filp->private_data = sysfs_get(pos);  		mutex_unlock(&sysfs_mutex); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 62f4fb37789..00012e31829 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,  	const void *ns = NULL;  	int err; +	if (!dir_sd) { +		WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n", +			kobject_name(kobj)); +		return -ENOENT; +	} +  	err = 0;  	if (!sysfs_ns_type(dir_sd))  		goto out; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4a802b4a905..feb2d69396c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -136,12 +136,13 @@ static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, u32 *sec  	void *old_secdata;  	size_t old_secdata_len; -	iattrs = sd->s_iattr; -	if (!iattrs) -		iattrs = sysfs_init_inode_attrs(sd); -	if (!iattrs) -		return -ENOMEM; +	if (!sd->s_iattr) { +		sd->s_iattr = sysfs_init_inode_attrs(sd); +		if (!sd->s_iattr) +			return -ENOMEM; +	} +	iattrs = sd->s_iattr;  	old_secdata = iattrs->ia_secdata;  	
old_secdata_len = iattrs->ia_secdata_len; @@ -318,8 +319,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha  	struct sysfs_addrm_cxt acxt;  	struct sysfs_dirent *sd; -	if (!dir_sd) +	if (!dir_sd) { +		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", +			name);  		return -ENOENT; +	}  	sysfs_addrm_start(&acxt, dir_sd); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e34f0d99ea4..52c3bdb66a8 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -36,7 +36,7 @@ struct sysfs_dirent sysfs_root = {  	.s_name		= "",  	.s_count	= ATOMIC_INIT(1),  	.s_flags	= SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), -	.s_mode		= S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, +	.s_mode		= S_IFDIR | S_IRUGO | S_IXUGO,  	.s_ino		= 1,  }; @@ -61,10 +61,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)  	}  	/* instantiate and link root dentry */ -	root = d_alloc_root(inode); +	root = d_make_root(inode);  	if (!root) {  		pr_debug("%s: could not get root dentry!\n",__func__); -		iput(inode);  		return -ENOMEM;  	}  	root->d_fsdata = &sysfs_root; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 7484a36ee67..661a9639570 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -20,9 +20,8 @@ struct sysfs_elem_dir {  	struct kobject		*kobj;  	unsigned long		subdirs; - -	struct rb_root		inode_tree; -	struct rb_root		name_tree; +	/* children rbtree starts here and goes through sd->s_rb */ +	struct rb_root		children;  };  struct sysfs_elem_symlink { @@ -62,8 +61,7 @@ struct sysfs_dirent {  	struct sysfs_dirent	*s_parent;  	const char		*s_name; -	struct rb_node		inode_node; -	struct rb_node		name_node; +	struct rb_node		s_rb;  	union {  		struct completion	*completion; @@ -71,6 +69,7 @@ struct sysfs_dirent {  	} u;  	const void		*s_ns; /* namespace tag */ +	unsigned int		s_hash; /* ns + name hash */  	union {  		struct sysfs_elem_dir		s_dir;  		struct sysfs_elem_symlink	s_symlink; @@ -78,9 +77,9 @@ 
struct sysfs_dirent {  		struct sysfs_elem_bin_attr	s_bin_attr;  	}; -	unsigned int		s_flags; +	unsigned short		s_flags;  	umode_t 		s_mode; -	ino_t			s_ino; +	unsigned int		s_ino;  	struct sysfs_inode_attrs *s_iattr;  }; @@ -95,11 +94,11 @@ struct sysfs_dirent {  #define SYSFS_ACTIVE_REF		(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)  /* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK		0xff00 +#define SYSFS_NS_TYPE_MASK		0xf00  #define SYSFS_NS_TYPE_SHIFT		8  #define SYSFS_FLAG_MASK			~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) -#define SYSFS_FLAG_REMOVED		0x020000 +#define SYSFS_FLAG_REMOVED		0x02000  static inline unsigned int sysfs_type(struct sysfs_dirent *sd)  { diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index b217797e621..d7466e29361 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -121,9 +121,6 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,  {  	struct inode *inode = old_dentry->d_inode; -	if (inode->i_nlink >= SYSV_SB(inode->i_sb)->s_link_max) -		return -EMLINK; -  	inode->i_ctime = CURRENT_TIME_SEC;  	inode_inc_link_count(inode);  	ihold(inode); @@ -134,10 +131,8 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,  static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode)  {  	struct inode * inode; -	int err = -EMLINK; +	int err; -	if (dir->i_nlink >= SYSV_SB(dir->i_sb)->s_link_max)  -		goto out;  	inode_inc_link_count(dir);  	inode = sysv_new_inode(dir, S_IFDIR|mode); @@ -251,11 +246,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,  			drop_nlink(new_inode);  		inode_dec_link_count(new_inode);  	} else { -		if (dir_de) { -			err = -EMLINK; -			if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) -				goto out_dir; -		}  		err = sysv_add_link(new_dentry, old_inode);  		if (err)  			goto out_dir; diff --git a/fs/sysv/super.c b/fs/sysv/super.c index f60c196913e..7491c33b646 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ 
-44,7 +44,7 @@ enum {  	JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60  }; -static void detected_xenix(struct sysv_sb_info *sbi) +static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links)  {  	struct buffer_head *bh1 = sbi->s_bh1;  	struct buffer_head *bh2 = sbi->s_bh2; @@ -59,7 +59,7 @@ static void detected_xenix(struct sysv_sb_info *sbi)  		sbd2 = (struct xenix_super_block *) (bh2->b_data - 512);  	} -	sbi->s_link_max = XENIX_LINK_MAX; +	*max_links = XENIX_LINK_MAX;  	sbi->s_fic_size = XENIX_NICINOD;  	sbi->s_flc_size = XENIX_NICFREE;  	sbi->s_sbd1 = (char *)sbd1; @@ -75,7 +75,7 @@ static void detected_xenix(struct sysv_sb_info *sbi)  	sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize);  } -static void detected_sysv4(struct sysv_sb_info *sbi) +static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links)  {  	struct sysv4_super_block * sbd;  	struct buffer_head *bh1 = sbi->s_bh1; @@ -86,7 +86,7 @@ static void detected_sysv4(struct sysv_sb_info *sbi)  	else  		sbd = (struct sysv4_super_block *) bh2->b_data; -	sbi->s_link_max = SYSV_LINK_MAX; +	*max_links = SYSV_LINK_MAX;  	sbi->s_fic_size = SYSV_NICINOD;  	sbi->s_flc_size = SYSV_NICFREE;  	sbi->s_sbd1 = (char *)sbd; @@ -103,7 +103,7 @@ static void detected_sysv4(struct sysv_sb_info *sbi)  	sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);  } -static void detected_sysv2(struct sysv_sb_info *sbi) +static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links)  {  	struct sysv2_super_block *sbd;  	struct buffer_head *bh1 = sbi->s_bh1; @@ -114,7 +114,7 @@ static void detected_sysv2(struct sysv_sb_info *sbi)  	else  		sbd = (struct sysv2_super_block *) bh2->b_data; -	sbi->s_link_max = SYSV_LINK_MAX; +	*max_links = SYSV_LINK_MAX;  	sbi->s_fic_size = SYSV_NICINOD;  	sbi->s_flc_size = SYSV_NICFREE;  	sbi->s_sbd1 = (char *)sbd; @@ -131,14 +131,14 @@ static void detected_sysv2(struct sysv_sb_info *sbi)  	sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);  } -static void detected_coherent(struct 
sysv_sb_info *sbi) +static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links)  {  	struct coh_super_block * sbd;  	struct buffer_head *bh1 = sbi->s_bh1;  	sbd = (struct coh_super_block *) bh1->b_data; -	sbi->s_link_max = COH_LINK_MAX; +	*max_links = COH_LINK_MAX;  	sbi->s_fic_size = COH_NICINOD;  	sbi->s_flc_size = COH_NICFREE;  	sbi->s_sbd1 = (char *)sbd; @@ -154,12 +154,12 @@ static void detected_coherent(struct sysv_sb_info *sbi)  	sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);  } -static void detected_v7(struct sysv_sb_info *sbi) +static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links)  {  	struct buffer_head *bh2 = sbi->s_bh2;  	struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; -	sbi->s_link_max = V7_LINK_MAX; +	*max_links = V7_LINK_MAX;  	sbi->s_fic_size = V7_NICINOD;  	sbi->s_flc_size = V7_NICFREE;  	sbi->s_sbd1 = (char *)sbd; @@ -290,7 +290,7 @@ static char *flavour_names[] = {  	[FSTYPE_AFS]	= "AFS",  }; -static void (*flavour_setup[])(struct sysv_sb_info *) = { +static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = {  	[FSTYPE_XENIX]	= detected_xenix,  	[FSTYPE_SYSV4]	= detected_sysv4,  	[FSTYPE_SYSV2]	= detected_sysv2, @@ -310,7 +310,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size)  	sbi->s_firstinodezone = 2; -	flavour_setup[sbi->s_type](sbi); +	flavour_setup[sbi->s_type](sbi, &sb->s_max_links);  	sbi->s_truncate = 1;  	sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; @@ -341,9 +341,8 @@ static int complete_read_super(struct super_block *sb, int silent, int size)  		printk("SysV FS: get root inode failed\n");  		return 0;  	} -	sb->s_root = d_alloc_root(root_inode); +	sb->s_root = d_make_root(root_inode);  	if (!sb->s_root) { -		iput(root_inode);  		printk("SysV FS: get root dentry failed\n");  		return 0;  	} diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 0e4b821c569..11b07672f6c 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -24,7 +24,6 
@@ struct sysv_sb_info {  	char	       s_bytesex;	/* bytesex (le/be/pdp) */  	char	       s_truncate;	/* if 1: names > SYSV_NAMELEN chars are truncated */  					/* if 0: they are disallowed (ENAMETOOLONG) */ -	nlink_t        s_link_max;	/* max number of hard links to a file */  	unsigned int   s_inodes_per_block;	/* number of inodes per block */  	unsigned int   s_inodes_per_block_1;	/* inodes_per_block - 1 */  	unsigned int   s_inodes_per_block_bits;	/* log2(inodes_per_block) */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index f922cbacdb9..1934084e208 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -36,7 +36,7 @@  #ifdef CONFIG_UBIFS_FS_DEBUG -DEFINE_SPINLOCK(dbg_lock); +static DEFINE_SPINLOCK(dbg_lock);  static const char *get_key_fmt(int fmt)  { @@ -221,15 +221,15 @@ const char *dbg_jhead(int jhead)  static void dump_ch(const struct ubifs_ch *ch)  { -	printk(KERN_DEBUG "\tmagic          %#x\n", le32_to_cpu(ch->magic)); -	printk(KERN_DEBUG "\tcrc            %#x\n", le32_to_cpu(ch->crc)); -	printk(KERN_DEBUG "\tnode_type      %d (%s)\n", ch->node_type, +	printk(KERN_ERR "\tmagic          %#x\n", le32_to_cpu(ch->magic)); +	printk(KERN_ERR "\tcrc            %#x\n", le32_to_cpu(ch->crc)); +	printk(KERN_ERR "\tnode_type      %d (%s)\n", ch->node_type,  	       dbg_ntype(ch->node_type)); -	printk(KERN_DEBUG "\tgroup_type     %d (%s)\n", ch->group_type, +	printk(KERN_ERR "\tgroup_type     %d (%s)\n", ch->group_type,  	       dbg_gtype(ch->group_type)); -	printk(KERN_DEBUG "\tsqnum          %llu\n", +	printk(KERN_ERR "\tsqnum          %llu\n",  	       (unsigned long long)le64_to_cpu(ch->sqnum)); -	printk(KERN_DEBUG "\tlen            %u\n", le32_to_cpu(ch->len)); +	printk(KERN_ERR "\tlen            %u\n", le32_to_cpu(ch->len));  }  void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode) @@ -240,43 +240,43 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)  	struct ubifs_dent_node *dent, *pdent = NULL;  	int count = 2; -	
printk(KERN_DEBUG "Dump in-memory inode:"); -	printk(KERN_DEBUG "\tinode          %lu\n", inode->i_ino); -	printk(KERN_DEBUG "\tsize           %llu\n", +	printk(KERN_ERR "Dump in-memory inode:"); +	printk(KERN_ERR "\tinode          %lu\n", inode->i_ino); +	printk(KERN_ERR "\tsize           %llu\n",  	       (unsigned long long)i_size_read(inode)); -	printk(KERN_DEBUG "\tnlink          %u\n", inode->i_nlink); -	printk(KERN_DEBUG "\tuid            %u\n", (unsigned int)inode->i_uid); -	printk(KERN_DEBUG "\tgid            %u\n", (unsigned int)inode->i_gid); -	printk(KERN_DEBUG "\tatime          %u.%u\n", +	printk(KERN_ERR "\tnlink          %u\n", inode->i_nlink); +	printk(KERN_ERR "\tuid            %u\n", (unsigned int)inode->i_uid); +	printk(KERN_ERR "\tgid            %u\n", (unsigned int)inode->i_gid); +	printk(KERN_ERR "\tatime          %u.%u\n",  	       (unsigned int)inode->i_atime.tv_sec,  	       (unsigned int)inode->i_atime.tv_nsec); -	printk(KERN_DEBUG "\tmtime          %u.%u\n", +	printk(KERN_ERR "\tmtime          %u.%u\n",  	       (unsigned int)inode->i_mtime.tv_sec,  	       (unsigned int)inode->i_mtime.tv_nsec); -	printk(KERN_DEBUG "\tctime          %u.%u\n", +	printk(KERN_ERR "\tctime          %u.%u\n",  	       (unsigned int)inode->i_ctime.tv_sec,  	       (unsigned int)inode->i_ctime.tv_nsec); -	printk(KERN_DEBUG "\tcreat_sqnum    %llu\n", ui->creat_sqnum); -	printk(KERN_DEBUG "\txattr_size     %u\n", ui->xattr_size); -	printk(KERN_DEBUG "\txattr_cnt      %u\n", ui->xattr_cnt); -	printk(KERN_DEBUG "\txattr_names    %u\n", ui->xattr_names); -	printk(KERN_DEBUG "\tdirty          %u\n", ui->dirty); -	printk(KERN_DEBUG "\txattr          %u\n", ui->xattr); -	printk(KERN_DEBUG "\tbulk_read      %u\n", ui->xattr); -	printk(KERN_DEBUG "\tsynced_i_size  %llu\n", +	printk(KERN_ERR "\tcreat_sqnum    %llu\n", ui->creat_sqnum); +	printk(KERN_ERR "\txattr_size     %u\n", ui->xattr_size); +	printk(KERN_ERR "\txattr_cnt      %u\n", ui->xattr_cnt); +	printk(KERN_ERR 
"\txattr_names    %u\n", ui->xattr_names); +	printk(KERN_ERR "\tdirty          %u\n", ui->dirty); +	printk(KERN_ERR "\txattr          %u\n", ui->xattr); +	printk(KERN_ERR "\tbulk_read      %u\n", ui->xattr); +	printk(KERN_ERR "\tsynced_i_size  %llu\n",  	       (unsigned long long)ui->synced_i_size); -	printk(KERN_DEBUG "\tui_size        %llu\n", +	printk(KERN_ERR "\tui_size        %llu\n",  	       (unsigned long long)ui->ui_size); -	printk(KERN_DEBUG "\tflags          %d\n", ui->flags); -	printk(KERN_DEBUG "\tcompr_type     %d\n", ui->compr_type); -	printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); -	printk(KERN_DEBUG "\tread_in_a_row  %lu\n", ui->read_in_a_row); -	printk(KERN_DEBUG "\tdata_len       %d\n", ui->data_len); +	printk(KERN_ERR "\tflags          %d\n", ui->flags); +	printk(KERN_ERR "\tcompr_type     %d\n", ui->compr_type); +	printk(KERN_ERR "\tlast_page_read %lu\n", ui->last_page_read); +	printk(KERN_ERR "\tread_in_a_row  %lu\n", ui->read_in_a_row); +	printk(KERN_ERR "\tdata_len       %d\n", ui->data_len);  	if (!S_ISDIR(inode->i_mode))  		return; -	printk(KERN_DEBUG "List of directory entries:\n"); +	printk(KERN_ERR "List of directory entries:\n");  	ubifs_assert(!mutex_is_locked(&c->tnc_mutex));  	lowest_dent_key(c, &key, inode->i_ino); @@ -284,11 +284,11 @@ void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)  		dent = ubifs_tnc_next_ent(c, &key, &nm);  		if (IS_ERR(dent)) {  			if (PTR_ERR(dent) != -ENOENT) -				printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent)); +				printk(KERN_ERR "error %ld\n", PTR_ERR(dent));  			break;  		} -		printk(KERN_DEBUG "\t%d: %s (%s)\n", +		printk(KERN_ERR "\t%d: %s (%s)\n",  		       count++, dent->name, get_dent_type(dent->type));  		nm.name = dent->name; @@ -312,8 +312,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  	/* If the magic is incorrect, just hexdump the first bytes */  	if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { -		printk(KERN_DEBUG "Not a node, 
first %zu bytes:", UBIFS_CH_SZ); -		print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, +		printk(KERN_ERR "Not a node, first %zu bytes:", UBIFS_CH_SZ); +		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1,  			       (void *)node, UBIFS_CH_SZ, 1);  		return;  	} @@ -326,7 +326,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  	{  		const struct ubifs_pad_node *pad = node; -		printk(KERN_DEBUG "\tpad_len        %u\n", +		printk(KERN_ERR "\tpad_len        %u\n",  		       le32_to_cpu(pad->pad_len));  		break;  	} @@ -335,50 +335,50 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  		const struct ubifs_sb_node *sup = node;  		unsigned int sup_flags = le32_to_cpu(sup->flags); -		printk(KERN_DEBUG "\tkey_hash       %d (%s)\n", +		printk(KERN_ERR "\tkey_hash       %d (%s)\n",  		       (int)sup->key_hash, get_key_hash(sup->key_hash)); -		printk(KERN_DEBUG "\tkey_fmt        %d (%s)\n", +		printk(KERN_ERR "\tkey_fmt        %d (%s)\n",  		       (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); -		printk(KERN_DEBUG "\tflags          %#x\n", sup_flags); -		printk(KERN_DEBUG "\t  big_lpt      %u\n", +		printk(KERN_ERR "\tflags          %#x\n", sup_flags); +		printk(KERN_ERR "\t  big_lpt      %u\n",  		       !!(sup_flags & UBIFS_FLG_BIGLPT)); -		printk(KERN_DEBUG "\t  space_fixup  %u\n", +		printk(KERN_ERR "\t  space_fixup  %u\n",  		       !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); -		printk(KERN_DEBUG "\tmin_io_size    %u\n", +		printk(KERN_ERR "\tmin_io_size    %u\n",  		       le32_to_cpu(sup->min_io_size)); -		printk(KERN_DEBUG "\tleb_size       %u\n", +		printk(KERN_ERR "\tleb_size       %u\n",  		       le32_to_cpu(sup->leb_size)); -		printk(KERN_DEBUG "\tleb_cnt        %u\n", +		printk(KERN_ERR "\tleb_cnt        %u\n",  		       le32_to_cpu(sup->leb_cnt)); -		printk(KERN_DEBUG "\tmax_leb_cnt    %u\n", +		printk(KERN_ERR "\tmax_leb_cnt    %u\n",  		       le32_to_cpu(sup->max_leb_cnt)); -		printk(KERN_DEBUG "\tmax_bud_bytes 
 %llu\n", +		printk(KERN_ERR "\tmax_bud_bytes  %llu\n",  		       (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); -		printk(KERN_DEBUG "\tlog_lebs       %u\n", +		printk(KERN_ERR "\tlog_lebs       %u\n",  		       le32_to_cpu(sup->log_lebs)); -		printk(KERN_DEBUG "\tlpt_lebs       %u\n", +		printk(KERN_ERR "\tlpt_lebs       %u\n",  		       le32_to_cpu(sup->lpt_lebs)); -		printk(KERN_DEBUG "\torph_lebs      %u\n", +		printk(KERN_ERR "\torph_lebs      %u\n",  		       le32_to_cpu(sup->orph_lebs)); -		printk(KERN_DEBUG "\tjhead_cnt      %u\n", +		printk(KERN_ERR "\tjhead_cnt      %u\n",  		       le32_to_cpu(sup->jhead_cnt)); -		printk(KERN_DEBUG "\tfanout         %u\n", +		printk(KERN_ERR "\tfanout         %u\n",  		       le32_to_cpu(sup->fanout)); -		printk(KERN_DEBUG "\tlsave_cnt      %u\n", +		printk(KERN_ERR "\tlsave_cnt      %u\n",  		       le32_to_cpu(sup->lsave_cnt)); -		printk(KERN_DEBUG "\tdefault_compr  %u\n", +		printk(KERN_ERR "\tdefault_compr  %u\n",  		       (int)le16_to_cpu(sup->default_compr)); -		printk(KERN_DEBUG "\trp_size        %llu\n", +		printk(KERN_ERR "\trp_size        %llu\n",  		       (unsigned long long)le64_to_cpu(sup->rp_size)); -		printk(KERN_DEBUG "\trp_uid         %u\n", +		printk(KERN_ERR "\trp_uid         %u\n",  		       le32_to_cpu(sup->rp_uid)); -		printk(KERN_DEBUG "\trp_gid         %u\n", +		printk(KERN_ERR "\trp_gid         %u\n",  		       le32_to_cpu(sup->rp_gid)); -		printk(KERN_DEBUG "\tfmt_version    %u\n", +		printk(KERN_ERR "\tfmt_version    %u\n",  		       le32_to_cpu(sup->fmt_version)); -		printk(KERN_DEBUG "\ttime_gran      %u\n", +		printk(KERN_ERR "\ttime_gran      %u\n",  		       le32_to_cpu(sup->time_gran)); -		printk(KERN_DEBUG "\tUUID           %pUB\n", +		printk(KERN_ERR "\tUUID           %pUB\n",  		       sup->uuid);  		break;  	} @@ -386,61 +386,61 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  	{  		const struct ubifs_mst_node *mst = node; -		printk(KERN_DEBUG 
"\thighest_inum   %llu\n", +		printk(KERN_ERR "\thighest_inum   %llu\n",  		       (unsigned long long)le64_to_cpu(mst->highest_inum)); -		printk(KERN_DEBUG "\tcommit number  %llu\n", +		printk(KERN_ERR "\tcommit number  %llu\n",  		       (unsigned long long)le64_to_cpu(mst->cmt_no)); -		printk(KERN_DEBUG "\tflags          %#x\n", +		printk(KERN_ERR "\tflags          %#x\n",  		       le32_to_cpu(mst->flags)); -		printk(KERN_DEBUG "\tlog_lnum       %u\n", +		printk(KERN_ERR "\tlog_lnum       %u\n",  		       le32_to_cpu(mst->log_lnum)); -		printk(KERN_DEBUG "\troot_lnum      %u\n", +		printk(KERN_ERR "\troot_lnum      %u\n",  		       le32_to_cpu(mst->root_lnum)); -		printk(KERN_DEBUG "\troot_offs      %u\n", +		printk(KERN_ERR "\troot_offs      %u\n",  		       le32_to_cpu(mst->root_offs)); -		printk(KERN_DEBUG "\troot_len       %u\n", +		printk(KERN_ERR "\troot_len       %u\n",  		       le32_to_cpu(mst->root_len)); -		printk(KERN_DEBUG "\tgc_lnum        %u\n", +		printk(KERN_ERR "\tgc_lnum        %u\n",  		       le32_to_cpu(mst->gc_lnum)); -		printk(KERN_DEBUG "\tihead_lnum     %u\n", +		printk(KERN_ERR "\tihead_lnum     %u\n",  		       le32_to_cpu(mst->ihead_lnum)); -		printk(KERN_DEBUG "\tihead_offs     %u\n", +		printk(KERN_ERR "\tihead_offs     %u\n",  		       le32_to_cpu(mst->ihead_offs)); -		printk(KERN_DEBUG "\tindex_size     %llu\n", +		printk(KERN_ERR "\tindex_size     %llu\n",  		       (unsigned long long)le64_to_cpu(mst->index_size)); -		printk(KERN_DEBUG "\tlpt_lnum       %u\n", +		printk(KERN_ERR "\tlpt_lnum       %u\n",  		       le32_to_cpu(mst->lpt_lnum)); -		printk(KERN_DEBUG "\tlpt_offs       %u\n", +		printk(KERN_ERR "\tlpt_offs       %u\n",  		       le32_to_cpu(mst->lpt_offs)); -		printk(KERN_DEBUG "\tnhead_lnum     %u\n", +		printk(KERN_ERR "\tnhead_lnum     %u\n",  		       le32_to_cpu(mst->nhead_lnum)); -		printk(KERN_DEBUG "\tnhead_offs     %u\n", +		printk(KERN_ERR "\tnhead_offs     %u\n",  		       le32_to_cpu(mst->nhead_offs)); -	
	printk(KERN_DEBUG "\tltab_lnum      %u\n", +		printk(KERN_ERR "\tltab_lnum      %u\n",  		       le32_to_cpu(mst->ltab_lnum)); -		printk(KERN_DEBUG "\tltab_offs      %u\n", +		printk(KERN_ERR "\tltab_offs      %u\n",  		       le32_to_cpu(mst->ltab_offs)); -		printk(KERN_DEBUG "\tlsave_lnum     %u\n", +		printk(KERN_ERR "\tlsave_lnum     %u\n",  		       le32_to_cpu(mst->lsave_lnum)); -		printk(KERN_DEBUG "\tlsave_offs     %u\n", +		printk(KERN_ERR "\tlsave_offs     %u\n",  		       le32_to_cpu(mst->lsave_offs)); -		printk(KERN_DEBUG "\tlscan_lnum     %u\n", +		printk(KERN_ERR "\tlscan_lnum     %u\n",  		       le32_to_cpu(mst->lscan_lnum)); -		printk(KERN_DEBUG "\tleb_cnt        %u\n", +		printk(KERN_ERR "\tleb_cnt        %u\n",  		       le32_to_cpu(mst->leb_cnt)); -		printk(KERN_DEBUG "\tempty_lebs     %u\n", +		printk(KERN_ERR "\tempty_lebs     %u\n",  		       le32_to_cpu(mst->empty_lebs)); -		printk(KERN_DEBUG "\tidx_lebs       %u\n", +		printk(KERN_ERR "\tidx_lebs       %u\n",  		       le32_to_cpu(mst->idx_lebs)); -		printk(KERN_DEBUG "\ttotal_free     %llu\n", +		printk(KERN_ERR "\ttotal_free     %llu\n",  		       (unsigned long long)le64_to_cpu(mst->total_free)); -		printk(KERN_DEBUG "\ttotal_dirty    %llu\n", +		printk(KERN_ERR "\ttotal_dirty    %llu\n",  		       (unsigned long long)le64_to_cpu(mst->total_dirty)); -		printk(KERN_DEBUG "\ttotal_used     %llu\n", +		printk(KERN_ERR "\ttotal_used     %llu\n",  		       (unsigned long long)le64_to_cpu(mst->total_used)); -		printk(KERN_DEBUG "\ttotal_dead     %llu\n", +		printk(KERN_ERR "\ttotal_dead     %llu\n",  		       (unsigned long long)le64_to_cpu(mst->total_dead)); -		printk(KERN_DEBUG "\ttotal_dark     %llu\n", +		printk(KERN_ERR "\ttotal_dark     %llu\n",  		       (unsigned long long)le64_to_cpu(mst->total_dark));  		break;  	} @@ -448,11 +448,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  	{  		const struct ubifs_ref_node *ref = node; -		printk(KERN_DEBUG "\tlnum        
   %u\n", +		printk(KERN_ERR "\tlnum           %u\n",  		       le32_to_cpu(ref->lnum)); -		printk(KERN_DEBUG "\toffs           %u\n", +		printk(KERN_ERR "\toffs           %u\n",  		       le32_to_cpu(ref->offs)); -		printk(KERN_DEBUG "\tjhead          %u\n", +		printk(KERN_ERR "\tjhead          %u\n",  		       le32_to_cpu(ref->jhead));  		break;  	} @@ -461,40 +461,40 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  		const struct ubifs_ino_node *ino = node;  		key_read(c, &ino->key, &key); -		printk(KERN_DEBUG "\tkey            %s\n", +		printk(KERN_ERR "\tkey            %s\n",  		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); -		printk(KERN_DEBUG "\tcreat_sqnum    %llu\n", +		printk(KERN_ERR "\tcreat_sqnum    %llu\n",  		       (unsigned long long)le64_to_cpu(ino->creat_sqnum)); -		printk(KERN_DEBUG "\tsize           %llu\n", +		printk(KERN_ERR "\tsize           %llu\n",  		       (unsigned long long)le64_to_cpu(ino->size)); -		printk(KERN_DEBUG "\tnlink          %u\n", +		printk(KERN_ERR "\tnlink          %u\n",  		       le32_to_cpu(ino->nlink)); -		printk(KERN_DEBUG "\tatime          %lld.%u\n", +		printk(KERN_ERR "\tatime          %lld.%u\n",  		       (long long)le64_to_cpu(ino->atime_sec),  		       le32_to_cpu(ino->atime_nsec)); -		printk(KERN_DEBUG "\tmtime          %lld.%u\n", +		printk(KERN_ERR "\tmtime          %lld.%u\n",  		       (long long)le64_to_cpu(ino->mtime_sec),  		       le32_to_cpu(ino->mtime_nsec)); -		printk(KERN_DEBUG "\tctime          %lld.%u\n", +		printk(KERN_ERR "\tctime          %lld.%u\n",  		       (long long)le64_to_cpu(ino->ctime_sec),  		       le32_to_cpu(ino->ctime_nsec)); -		printk(KERN_DEBUG "\tuid            %u\n", +		printk(KERN_ERR "\tuid            %u\n",  		       le32_to_cpu(ino->uid)); -		printk(KERN_DEBUG "\tgid            %u\n", +		printk(KERN_ERR "\tgid            %u\n",  		       le32_to_cpu(ino->gid)); -		printk(KERN_DEBUG "\tmode           %u\n", +		printk(KERN_ERR "\tmode    
       %u\n",  		       le32_to_cpu(ino->mode)); -		printk(KERN_DEBUG "\tflags          %#x\n", +		printk(KERN_ERR "\tflags          %#x\n",  		       le32_to_cpu(ino->flags)); -		printk(KERN_DEBUG "\txattr_cnt      %u\n", +		printk(KERN_ERR "\txattr_cnt      %u\n",  		       le32_to_cpu(ino->xattr_cnt)); -		printk(KERN_DEBUG "\txattr_size     %u\n", +		printk(KERN_ERR "\txattr_size     %u\n",  		       le32_to_cpu(ino->xattr_size)); -		printk(KERN_DEBUG "\txattr_names    %u\n", +		printk(KERN_ERR "\txattr_names    %u\n",  		       le32_to_cpu(ino->xattr_names)); -		printk(KERN_DEBUG "\tcompr_type     %#x\n", +		printk(KERN_ERR "\tcompr_type     %#x\n",  		       (int)le16_to_cpu(ino->compr_type)); -		printk(KERN_DEBUG "\tdata len       %u\n", +		printk(KERN_ERR "\tdata len       %u\n",  		       le32_to_cpu(ino->data_len));  		break;  	} @@ -505,16 +505,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  		int nlen = le16_to_cpu(dent->nlen);  		key_read(c, &dent->key, &key); -		printk(KERN_DEBUG "\tkey            %s\n", +		printk(KERN_ERR "\tkey            %s\n",  		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); -		printk(KERN_DEBUG "\tinum           %llu\n", +		printk(KERN_ERR "\tinum           %llu\n",  		       (unsigned long long)le64_to_cpu(dent->inum)); -		printk(KERN_DEBUG "\ttype           %d\n", (int)dent->type); -		printk(KERN_DEBUG "\tnlen           %d\n", nlen); -		printk(KERN_DEBUG "\tname           "); +		printk(KERN_ERR "\ttype           %d\n", (int)dent->type); +		printk(KERN_ERR "\tnlen           %d\n", nlen); +		printk(KERN_ERR "\tname           ");  		if (nlen > UBIFS_MAX_NLEN) -			printk(KERN_DEBUG "(bad name length, not printing, " +			printk(KERN_ERR "(bad name length, not printing, "  					  "bad or corrupted node)");  		else {  			for (i = 0; i < nlen && dent->name[i]; i++) @@ -530,16 +530,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  		int dlen = le32_to_cpu(ch->len) - 
UBIFS_DATA_NODE_SZ;  		key_read(c, &dn->key, &key); -		printk(KERN_DEBUG "\tkey            %s\n", +		printk(KERN_ERR "\tkey            %s\n",  		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); -		printk(KERN_DEBUG "\tsize           %u\n", +		printk(KERN_ERR "\tsize           %u\n",  		       le32_to_cpu(dn->size)); -		printk(KERN_DEBUG "\tcompr_typ      %d\n", +		printk(KERN_ERR "\tcompr_typ      %d\n",  		       (int)le16_to_cpu(dn->compr_type)); -		printk(KERN_DEBUG "\tdata size      %d\n", +		printk(KERN_ERR "\tdata size      %d\n",  		       dlen); -		printk(KERN_DEBUG "\tdata:\n"); -		print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, +		printk(KERN_ERR "\tdata:\n"); +		print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,  			       (void *)&dn->data, dlen, 0);  		break;  	} @@ -547,11 +547,11 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  	{  		const struct ubifs_trun_node *trun = node; -		printk(KERN_DEBUG "\tinum           %u\n", +		printk(KERN_ERR "\tinum           %u\n",  		       le32_to_cpu(trun->inum)); -		printk(KERN_DEBUG "\told_size       %llu\n", +		printk(KERN_ERR "\told_size       %llu\n",  		       (unsigned long long)le64_to_cpu(trun->old_size)); -		printk(KERN_DEBUG "\tnew_size       %llu\n", +		printk(KERN_ERR "\tnew_size       %llu\n",  		       (unsigned long long)le64_to_cpu(trun->new_size));  		break;  	} @@ -560,17 +560,17 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  		const struct ubifs_idx_node *idx = node;  		n = le16_to_cpu(idx->child_cnt); -		printk(KERN_DEBUG "\tchild_cnt      %d\n", n); -		printk(KERN_DEBUG "\tlevel          %d\n", +		printk(KERN_ERR "\tchild_cnt      %d\n", n); +		printk(KERN_ERR "\tlevel          %d\n",  		       (int)le16_to_cpu(idx->level)); -		printk(KERN_DEBUG "\tBranches:\n"); +		printk(KERN_ERR "\tBranches:\n");  		for (i = 0; i < n && i < c->fanout - 1; i++) {  			const struct ubifs_branch *br;  			br = ubifs_idx_branch(c, idx, i);  	
		key_read(c, &br->key, &key); -			printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", +			printk(KERN_ERR "\t%d: LEB %d:%d len %d key %s\n",  			       i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),  			       le32_to_cpu(br->len),  			       dbg_snprintf_key(c, &key, key_buf, @@ -584,20 +584,20 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  	{  		const struct ubifs_orph_node *orph = node; -		printk(KERN_DEBUG "\tcommit number  %llu\n", +		printk(KERN_ERR "\tcommit number  %llu\n",  		       (unsigned long long)  				le64_to_cpu(orph->cmt_no) & LLONG_MAX); -		printk(KERN_DEBUG "\tlast node flag %llu\n", +		printk(KERN_ERR "\tlast node flag %llu\n",  		       (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);  		n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; -		printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); +		printk(KERN_ERR "\t%d orphan inode numbers:\n", n);  		for (i = 0; i < n; i++) -			printk(KERN_DEBUG "\t  ino %llu\n", +			printk(KERN_ERR "\t  ino %llu\n",  			       (unsigned long long)le64_to_cpu(orph->inos[i]));  		break;  	}  	default: -		printk(KERN_DEBUG "node type %d was not recognized\n", +		printk(KERN_ERR "node type %d was not recognized\n",  		       (int)ch->node_type);  	}  	spin_unlock(&dbg_lock); @@ -606,16 +606,16 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  void dbg_dump_budget_req(const struct ubifs_budget_req *req)  {  	spin_lock(&dbg_lock); -	printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", +	printk(KERN_ERR "Budgeting request: new_ino %d, dirtied_ino %d\n",  	       req->new_ino, req->dirtied_ino); -	printk(KERN_DEBUG "\tnew_ino_d   %d, dirtied_ino_d %d\n", +	printk(KERN_ERR "\tnew_ino_d   %d, dirtied_ino_d %d\n",  	       req->new_ino_d, req->dirtied_ino_d); -	printk(KERN_DEBUG "\tnew_page    %d, dirtied_page %d\n", +	printk(KERN_ERR "\tnew_page    %d, dirtied_page %d\n",  	       req->new_page, req->dirtied_page); -	printk(KERN_DEBUG "\tnew_dent    
%d, mod_dent     %d\n", +	printk(KERN_ERR "\tnew_dent    %d, mod_dent     %d\n",  	       req->new_dent, req->mod_dent); -	printk(KERN_DEBUG "\tidx_growth  %d\n", req->idx_growth); -	printk(KERN_DEBUG "\tdata_growth %d dd_growth     %d\n", +	printk(KERN_ERR "\tidx_growth  %d\n", req->idx_growth); +	printk(KERN_ERR "\tdata_growth %d dd_growth     %d\n",  	       req->data_growth, req->dd_growth);  	spin_unlock(&dbg_lock);  } @@ -623,12 +623,12 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)  void dbg_dump_lstats(const struct ubifs_lp_stats *lst)  {  	spin_lock(&dbg_lock); -	printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " +	printk(KERN_ERR "(pid %d) Lprops statistics: empty_lebs %d, "  	       "idx_lebs  %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); -	printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " +	printk(KERN_ERR "\ttaken_empty_lebs %d, total_free %lld, "  	       "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,  	       lst->total_dirty); -	printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " +	printk(KERN_ERR "\ttotal_used %lld, total_dark %lld, "  	       "total_dead %lld\n", lst->total_used, lst->total_dark,  	       lst->total_dead);  	spin_unlock(&dbg_lock); @@ -644,21 +644,21 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)  	spin_lock(&c->space_lock);  	spin_lock(&dbg_lock); -	printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " +	printk(KERN_ERR "(pid %d) Budgeting info: data budget sum %lld, "  	       "total budget sum %lld\n", current->pid,  	       bi->data_growth + bi->dd_growth,  	       bi->data_growth + bi->dd_growth + bi->idx_growth); -	printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " +	printk(KERN_ERR "\tbudg_data_growth %lld, budg_dd_growth %lld, "  	       "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,  	       bi->idx_growth); -	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " +	printk(KERN_ERR 
"\tmin_idx_lebs %d, old_idx_sz %llu, "  	       "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,  	       bi->uncommitted_idx); -	printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", +	printk(KERN_ERR "\tpage_budget %d, inode_budget %d, dent_budget %d\n",  	       bi->page_budget, bi->inode_budget, bi->dent_budget); -	printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", +	printk(KERN_ERR "\tnospace %u, nospace_rp %u\n",  	       bi->nospace, bi->nospace_rp); -	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", +	printk(KERN_ERR "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",  	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);  	if (bi != &c->bi) @@ -669,38 +669,38 @@ void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)  		 */  		goto out_unlock; -	printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", +	printk(KERN_ERR "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",  	       c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); -	printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " +	printk(KERN_ERR "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "  	       "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),  	       atomic_long_read(&c->dirty_zn_cnt),  	       atomic_long_read(&c->clean_zn_cnt)); -	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", +	printk(KERN_ERR "\tgc_lnum %d, ihead_lnum %d\n",  	       c->gc_lnum, c->ihead_lnum);  	/* If we are in R/O mode, journal heads do not exist */  	if (c->jheads)  		for (i = 0; i < c->jhead_cnt; i++) -			printk(KERN_DEBUG "\tjhead %s\t LEB %d\n", +			printk(KERN_ERR "\tjhead %s\t LEB %d\n",  			       dbg_jhead(c->jheads[i].wbuf.jhead),  			       c->jheads[i].wbuf.lnum);  	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {  		bud = rb_entry(rb, struct ubifs_bud, rb); -		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); +		printk(KERN_ERR "\tbud LEB %d\n", bud->lnum);  	}  	list_for_each_entry(bud, &c->old_buds, list) -		
printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); +		printk(KERN_ERR "\told bud LEB %d\n", bud->lnum);  	list_for_each_entry(idx_gc, &c->idx_gc, list) -		printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", +		printk(KERN_ERR "\tGC'ed idx LEB %d unmap %d\n",  		       idx_gc->lnum, idx_gc->unmap); -	printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); +	printk(KERN_ERR "\tcommit state %d\n", c->cmt_state);  	/* Print budgeting predictions */  	available = ubifs_calc_available(c, c->bi.min_idx_lebs);  	outstanding = c->bi.data_growth + c->bi.dd_growth;  	free = ubifs_get_free_space_nolock(c); -	printk(KERN_DEBUG "Budgeting predictions:\n"); -	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", +	printk(KERN_ERR "Budgeting predictions:\n"); +	printk(KERN_ERR "\tavailable: %lld, outstanding %lld, free %lld\n",  	       available, outstanding, free);  out_unlock:  	spin_unlock(&dbg_lock); @@ -720,11 +720,11 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)  		dark = ubifs_calc_dark(c, spc);  	if (lp->flags & LPROPS_INDEX) -		printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " +		printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "  		       "free + dirty %-8d flags %#x (", lp->lnum, lp->free,  		       lp->dirty, c->leb_size - spc, spc, lp->flags);  	else -		printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d " +		printk(KERN_ERR "LEB %-7d free %-8d dirty %-8d used %-8d "  		       "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "  		       "flags %#-4x (", lp->lnum, lp->free, lp->dirty,  		       c->leb_size - spc, spc, dark, dead, @@ -807,7 +807,7 @@ void dbg_dump_lprops(struct ubifs_info *c)  	struct ubifs_lprops lp;  	struct ubifs_lp_stats lst; -	printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", +	printk(KERN_ERR "(pid %d) start dumping LEB properties\n",  	       current->pid);  	ubifs_get_lp_stats(c, &lst);  	dbg_dump_lstats(&lst); @@ -819,7 +819,7 @@ void 
dbg_dump_lprops(struct ubifs_info *c)  		dbg_dump_lprop(c, &lp);  	} -	printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", +	printk(KERN_ERR "(pid %d) finish dumping LEB properties\n",  	       current->pid);  } @@ -828,35 +828,35 @@ void dbg_dump_lpt_info(struct ubifs_info *c)  	int i;  	spin_lock(&dbg_lock); -	printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); -	printk(KERN_DEBUG "\tlpt_sz:        %lld\n", c->lpt_sz); -	printk(KERN_DEBUG "\tpnode_sz:      %d\n", c->pnode_sz); -	printk(KERN_DEBUG "\tnnode_sz:      %d\n", c->nnode_sz); -	printk(KERN_DEBUG "\tltab_sz:       %d\n", c->ltab_sz); -	printk(KERN_DEBUG "\tlsave_sz:      %d\n", c->lsave_sz); -	printk(KERN_DEBUG "\tbig_lpt:       %d\n", c->big_lpt); -	printk(KERN_DEBUG "\tlpt_hght:      %d\n", c->lpt_hght); -	printk(KERN_DEBUG "\tpnode_cnt:     %d\n", c->pnode_cnt); -	printk(KERN_DEBUG "\tnnode_cnt:     %d\n", c->nnode_cnt); -	printk(KERN_DEBUG "\tdirty_pn_cnt:  %d\n", c->dirty_pn_cnt); -	printk(KERN_DEBUG "\tdirty_nn_cnt:  %d\n", c->dirty_nn_cnt); -	printk(KERN_DEBUG "\tlsave_cnt:     %d\n", c->lsave_cnt); -	printk(KERN_DEBUG "\tspace_bits:    %d\n", c->space_bits); -	printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); -	printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); -	printk(KERN_DEBUG "\tlpt_spc_bits:  %d\n", c->lpt_spc_bits); -	printk(KERN_DEBUG "\tpcnt_bits:     %d\n", c->pcnt_bits); -	printk(KERN_DEBUG "\tlnum_bits:     %d\n", c->lnum_bits); -	printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); -	printk(KERN_DEBUG "\tLPT head is at %d:%d\n", +	printk(KERN_ERR "(pid %d) dumping LPT information\n", current->pid); +	printk(KERN_ERR "\tlpt_sz:        %lld\n", c->lpt_sz); +	printk(KERN_ERR "\tpnode_sz:      %d\n", c->pnode_sz); +	printk(KERN_ERR "\tnnode_sz:      %d\n", c->nnode_sz); +	printk(KERN_ERR "\tltab_sz:       %d\n", c->ltab_sz); +	printk(KERN_ERR "\tlsave_sz:      %d\n", c->lsave_sz); +	printk(KERN_ERR "\tbig_lpt:       
%d\n", c->big_lpt); +	printk(KERN_ERR "\tlpt_hght:      %d\n", c->lpt_hght); +	printk(KERN_ERR "\tpnode_cnt:     %d\n", c->pnode_cnt); +	printk(KERN_ERR "\tnnode_cnt:     %d\n", c->nnode_cnt); +	printk(KERN_ERR "\tdirty_pn_cnt:  %d\n", c->dirty_pn_cnt); +	printk(KERN_ERR "\tdirty_nn_cnt:  %d\n", c->dirty_nn_cnt); +	printk(KERN_ERR "\tlsave_cnt:     %d\n", c->lsave_cnt); +	printk(KERN_ERR "\tspace_bits:    %d\n", c->space_bits); +	printk(KERN_ERR "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); +	printk(KERN_ERR "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); +	printk(KERN_ERR "\tlpt_spc_bits:  %d\n", c->lpt_spc_bits); +	printk(KERN_ERR "\tpcnt_bits:     %d\n", c->pcnt_bits); +	printk(KERN_ERR "\tlnum_bits:     %d\n", c->lnum_bits); +	printk(KERN_ERR "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); +	printk(KERN_ERR "\tLPT head is at %d:%d\n",  	       c->nhead_lnum, c->nhead_offs); -	printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", +	printk(KERN_ERR "\tLPT ltab is at %d:%d\n",  	       c->ltab_lnum, c->ltab_offs);  	if (c->big_lpt) -		printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", +		printk(KERN_ERR "\tLPT lsave is at %d:%d\n",  		       c->lsave_lnum, c->lsave_offs);  	for (i = 0; i < c->lpt_lebs; i++) -		printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " +		printk(KERN_ERR "\tLPT LEB %d free %d dirty %d tgc %d "  		       "cmt %d\n", i + c->lpt_first, c->ltab[i].free,  		       c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);  	spin_unlock(&dbg_lock); @@ -867,12 +867,12 @@ void dbg_dump_sleb(const struct ubifs_info *c,  {  	struct ubifs_scan_node *snod; -	printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", +	printk(KERN_ERR "(pid %d) start dumping scanned data from LEB %d:%d\n",  	       current->pid, sleb->lnum, offs);  	list_for_each_entry(snod, &sleb->nodes, list) {  		cond_resched(); -		printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, +		printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", sleb->lnum,  		       
snod->offs, snod->len);  		dbg_dump_node(c, snod->node);  	} @@ -887,7 +887,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)  	if (dbg_is_tst_rcvry(c))  		return; -	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", +	printk(KERN_ERR "(pid %d) start dumping LEB %d\n",  	       current->pid, lnum);  	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); @@ -902,17 +902,17 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)  		goto out;  	} -	printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, +	printk(KERN_ERR "LEB %d has %d nodes ending at %d\n", lnum,  	       sleb->nodes_cnt, sleb->endpt);  	list_for_each_entry(snod, &sleb->nodes, list) {  		cond_resched(); -		printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, +		printk(KERN_ERR "Dumping node at LEB %d:%d len %d\n", lnum,  		       snod->offs, snod->len);  		dbg_dump_node(c, snod->node);  	} -	printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", +	printk(KERN_ERR "(pid %d) finish dumping LEB %d\n",  	       current->pid, lnum);  	ubifs_scan_destroy(sleb); @@ -934,7 +934,7 @@ void dbg_dump_znode(const struct ubifs_info *c,  	else  		zbr = &c->zroot; -	printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" +	printk(KERN_ERR "znode %p, LEB %d:%d len %d parent %p iip %d level %d"  	       " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,  	       zbr->len, znode->parent, znode->iip, znode->level,  	       znode->child_cnt, znode->flags); @@ -944,18 +944,18 @@ void dbg_dump_znode(const struct ubifs_info *c,  		return;  	} -	printk(KERN_DEBUG "zbranches:\n"); +	printk(KERN_ERR "zbranches:\n");  	for (n = 0; n < znode->child_cnt; n++) {  		zbr = &znode->zbranch[n];  		if (znode->level > 0) -			printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " +			printk(KERN_ERR "\t%d: znode %p LEB %d:%d len %d key "  					  "%s\n", n, zbr->znode, zbr->lnum,  					  zbr->offs, zbr->len,  					  dbg_snprintf_key(c, &zbr->key,  							   key_buf,  							   
DBG_KEY_BUF_LEN));  		else -			printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " +			printk(KERN_ERR "\t%d: LNC %p LEB %d:%d len %d key "  					  "%s\n", n, zbr->znode, zbr->lnum,  					  zbr->offs, zbr->len,  					  dbg_snprintf_key(c, &zbr->key, @@ -969,16 +969,16 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)  {  	int i; -	printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", +	printk(KERN_ERR "(pid %d) start dumping heap cat %d (%d elements)\n",  	       current->pid, cat, heap->cnt);  	for (i = 0; i < heap->cnt; i++) {  		struct ubifs_lprops *lprops = heap->arr[i]; -		printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " +		printk(KERN_ERR "\t%d. LEB %d hpos %d free %d dirty %d "  		       "flags %d\n", i, lprops->lnum, lprops->hpos,  		       lprops->free, lprops->dirty, lprops->flags);  	} -	printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); +	printk(KERN_ERR "(pid %d) finish dumping heap\n", current->pid);  }  void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, @@ -986,15 +986,15 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,  {  	int i; -	printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); -	printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", +	printk(KERN_ERR "(pid %d) dumping pnode:\n", current->pid); +	printk(KERN_ERR "\taddress %zx parent %zx cnext %zx\n",  	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); -	printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", +	printk(KERN_ERR "\tflags %lu iip %d level %d num %d\n",  	       pnode->flags, iip, pnode->level, pnode->num);  	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {  		struct ubifs_lprops *lp = &pnode->lprops[i]; -		printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", +		printk(KERN_ERR "\t%d: free %d dirty %d flags %d lnum %d\n",  		       i, lp->free, lp->dirty, lp->flags, lp->lnum);  	}  } @@ -1004,20 +1004,20 @@ void dbg_dump_tnc(struct 
ubifs_info *c)  	struct ubifs_znode *znode;  	int level; -	printk(KERN_DEBUG "\n"); -	printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); +	printk(KERN_ERR "\n"); +	printk(KERN_ERR "(pid %d) start dumping TNC tree\n", current->pid);  	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);  	level = znode->level; -	printk(KERN_DEBUG "== Level %d ==\n", level); +	printk(KERN_ERR "== Level %d ==\n", level);  	while (znode) {  		if (level != znode->level) {  			level = znode->level; -			printk(KERN_DEBUG "== Level %d ==\n", level); +			printk(KERN_ERR "== Level %d ==\n", level);  		}  		dbg_dump_znode(c, znode);  		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);  	} -	printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); +	printk(KERN_ERR "(pid %d) finish dumping TNC tree\n", current->pid);  }  static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index ad1a6fee601..9f717655df1 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -164,9 +164,7 @@ struct ubifs_global_debug_info {  #define dbg_dump_stack() dump_stack()  #define dbg_err(fmt, ...) do {                                                 \ -	spin_lock(&dbg_lock);                                                  \  	ubifs_err(fmt, ##__VA_ARGS__);                                         \ -	spin_unlock(&dbg_lock);                                                \  } while (0)  #define ubifs_dbg_msg(type, fmt, ...) \ @@ -217,7 +215,6 @@ struct ubifs_global_debug_info {  /* Additional recovery messages */  #define dbg_rcvry(fmt, ...) 
ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) -extern spinlock_t dbg_lock;  extern struct ubifs_global_debug_info ubifs_dbg;  static inline int dbg_is_chk_gen(const struct ubifs_info *c) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index d6fe1c79f18..ec9f1870ab7 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -566,6 +566,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)  	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);  	int err, budgeted = 1;  	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; +	unsigned int saved_nlink = inode->i_nlink;  	/*  	 * Budget request settings: deletion direntry, deletion inode (+1 for @@ -613,7 +614,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)  out_cancel:  	dir->i_size += sz_change;  	dir_ui->ui_size = dir->i_size; -	inc_nlink(inode); +	set_nlink(inode, saved_nlink);  	unlock_2_inodes(dir, inode);  	if (budgeted)  		ubifs_release_budget(c, &req); @@ -704,8 +705,7 @@ out_cancel:  	dir->i_size += sz_change;  	dir_ui->ui_size = dir->i_size;  	inc_nlink(dir); -	inc_nlink(inode); -	inc_nlink(inode); +	set_nlink(inode, 2);  	unlock_2_inodes(dir, inode);  	if (budgeted)  		ubifs_release_budget(c, &req); @@ -977,6 +977,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,  	struct ubifs_budget_req ino_req = { .dirtied_ino = 1,  			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };  	struct timespec time; +	unsigned int saved_nlink;  	/*  	 * Budget request settings: deletion direntry, new direntry, removing @@ -1059,13 +1060,14 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,  	if (unlink) {  		/*  		 * Directories cannot have hard-links, so if this is a -		 * directory, decrement its @i_nlink twice because an empty -		 * directory has @i_nlink 2. +		 * directory, just clear @i_nlink.  		 
*/ +		saved_nlink = new_inode->i_nlink;  		if (is_dir) +			clear_nlink(new_inode); +		else  			drop_nlink(new_inode);  		new_inode->i_ctime = time; -		drop_nlink(new_inode);  	} else {  		new_dir->i_size += new_sz;  		ubifs_inode(new_dir)->ui_size = new_dir->i_size; @@ -1102,9 +1104,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,  out_cancel:  	if (unlink) { -		if (is_dir) -			inc_nlink(new_inode); -		inc_nlink(new_inode); +		set_nlink(new_inode, saved_nlink);  	} else {  		new_dir->i_size -= new_sz;  		ubifs_inode(new_dir)->ui_size = new_dir->i_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f9c234bf33d..5c8f6dc1d28 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1042,10 +1042,10 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)  	 * the page size, the remaining memory is zeroed when mapped, and  	 * writes to that region are not written out to the file."  	 */ -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);  	flush_dcache_page(page); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	if (i_size > synced_i_size) {  		err = inode->i_sb->s_op->write_inode(inode, NULL); diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index ee4f43f4bb9..2a935b31723 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -679,7 +679,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,  			   ret == SCANNED_GARBAGE     ||  			   ret == SCANNED_A_BAD_PAD_NODE ||  			   ret == SCANNED_A_CORRUPT_NODE) { -			dbg_rcvry("found corruption - %d", ret); +			dbg_rcvry("found corruption (%d) at %d:%d", +				  ret, lnum, offs);  			break;  		} else {  			dbg_err("unexpected return value %d", ret); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 6094c5a5d7a..771f7fb6ce9 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -410,13 +410,23 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node 
*sup)  	}  	if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { -		err = 7; +		ubifs_err("too few main LEBs count %d, must be at least %d", +			  c->main_lebs, UBIFS_MIN_MAIN_LEBS);  		goto failed;  	} -	if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || -	    c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { -		err = 8; +	max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS; +	if (c->max_bud_bytes < max_bytes) { +		ubifs_err("too small journal (%lld bytes), must be at least " +			  "%lld bytes",  c->max_bud_bytes, max_bytes); +		goto failed; +	} + +	max_bytes = (long long)c->leb_size * c->main_lebs; +	if (c->max_bud_bytes > max_bytes) { +		ubifs_err("too large journal size (%lld bytes), only %lld bytes" +			  "available in the main area", +			  c->max_bud_bytes, max_bytes);  		goto failed;  	} @@ -450,7 +460,6 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)  		goto failed;  	} -	max_bytes = c->main_lebs * (long long)c->leb_size;  	if (c->rp_size < 0 || max_bytes < c->rp_size) {  		err = 14;  		goto failed; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 63765d58445..76e4e0566ad 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2076,15 +2076,13 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)  		goto out_umount;  	} -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) -		goto out_iput; +		goto out_umount;  	mutex_unlock(&c->umount_mutex);  	return 0; -out_iput: -	iput(root);  out_umount:  	ubifs_umount(c);  out_unlock: diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 12e94774aa8..93d59aceaae 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -84,9 +84,6 @@  #define INUM_WARN_WATERMARK 0xFFF00000  #define INUM_WATERMARK      0xFFFFFF00 -/* Largest key size supported in this implementation */ -#define CUR_MAX_KEY_LEN UBIFS_SK_LEN -  /* Maximum number of entries in each LPT (LEB category) heap */  #define LPT_HEAP_SZ 256 @@ -277,10 
+274,10 @@ struct ubifs_old_idx {  /* The below union makes it easier to deal with keys */  union ubifs_key { -	uint8_t u8[CUR_MAX_KEY_LEN]; -	uint32_t u32[CUR_MAX_KEY_LEN/4]; -	uint64_t u64[CUR_MAX_KEY_LEN/8]; -	__le32 j32[CUR_MAX_KEY_LEN/4]; +	uint8_t u8[UBIFS_SK_LEN]; +	uint32_t u32[UBIFS_SK_LEN/4]; +	uint64_t u64[UBIFS_SK_LEN/8]; +	__le32 j32[UBIFS_SK_LEN/4];  };  /** diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 987585bb0a1..1ba2baaf436 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -105,7 +105,6 @@ static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)  }  static void udf_bitmap_free_blocks(struct super_block *sb, -				   struct inode *inode,  				   struct udf_bitmap *bitmap,  				   struct kernel_lb_addr *bloc,  				   uint32_t offset, @@ -172,7 +171,6 @@ error_return:  }  static int udf_bitmap_prealloc_blocks(struct super_block *sb, -				      struct inode *inode,  				      struct udf_bitmap *bitmap,  				      uint16_t partition, uint32_t first_block,  				      uint32_t block_count) @@ -223,7 +221,6 @@ out:  }  static int udf_bitmap_new_block(struct super_block *sb, -				struct inode *inode,  				struct udf_bitmap *bitmap, uint16_t partition,  				uint32_t goal, int *err)  { @@ -349,7 +346,6 @@ error_return:  }  static void udf_table_free_blocks(struct super_block *sb, -				  struct inode *inode,  				  struct inode *table,  				  struct kernel_lb_addr *bloc,  				  uint32_t offset, @@ -581,7 +577,6 @@ error_return:  }  static int udf_table_prealloc_blocks(struct super_block *sb, -				     struct inode *inode,  				     struct inode *table, uint16_t partition,  				     uint32_t first_block, uint32_t block_count)  { @@ -643,7 +638,6 @@ static int udf_table_prealloc_blocks(struct super_block *sb,  }  static int udf_table_new_block(struct super_block *sb, -			       struct inode *inode,  			       struct inode *table, uint16_t partition,  			       uint32_t goal, int *err)  { @@ -743,18 +737,23 @@ void 
udf_free_blocks(struct super_block *sb, struct inode *inode,  	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];  	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { -		udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap, +		udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap,  				       bloc, offset, count);  	} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) { -		udf_table_free_blocks(sb, inode, map->s_uspace.s_table, +		udf_table_free_blocks(sb, map->s_uspace.s_table,  				      bloc, offset, count);  	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) { -		udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap, +		udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,  				       bloc, offset, count);  	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) { -		udf_table_free_blocks(sb, inode, map->s_fspace.s_table, +		udf_table_free_blocks(sb, map->s_fspace.s_table,  				      bloc, offset, count);  	} + +	if (inode) { +		inode_sub_bytes(inode, +				((sector_t)count) << sb->s_blocksize_bits); +	}  }  inline int udf_prealloc_blocks(struct super_block *sb, @@ -763,29 +762,34 @@ inline int udf_prealloc_blocks(struct super_block *sb,  			       uint32_t block_count)  {  	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; +	sector_t allocated;  	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) -		return udf_bitmap_prealloc_blocks(sb, inode, -						  map->s_uspace.s_bitmap, -						  partition, first_block, -						  block_count); +		allocated = udf_bitmap_prealloc_blocks(sb, +						       map->s_uspace.s_bitmap, +						       partition, first_block, +						       block_count);  	else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) -		return udf_table_prealloc_blocks(sb, inode, -						 map->s_uspace.s_table, -						 partition, first_block, -						 block_count); +		allocated = udf_table_prealloc_blocks(sb, +						      map->s_uspace.s_table, +						      partition, 
first_block, +						      block_count);  	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) -		return udf_bitmap_prealloc_blocks(sb, inode, -						  map->s_fspace.s_bitmap, -						  partition, first_block, -						  block_count); +		allocated = udf_bitmap_prealloc_blocks(sb, +						       map->s_fspace.s_bitmap, +						       partition, first_block, +						       block_count);  	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) -		return udf_table_prealloc_blocks(sb, inode, -						 map->s_fspace.s_table, -						 partition, first_block, -						 block_count); +		allocated = udf_table_prealloc_blocks(sb, +						      map->s_fspace.s_table, +						      partition, first_block, +						      block_count);  	else  		return 0; + +	if (inode && allocated > 0) +		inode_add_bytes(inode, allocated << sb->s_blocksize_bits); +	return allocated;  }  inline int udf_new_block(struct super_block *sb, @@ -793,25 +797,29 @@ inline int udf_new_block(struct super_block *sb,  			 uint16_t partition, uint32_t goal, int *err)  {  	struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; +	int block;  	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) -		return udf_bitmap_new_block(sb, inode, -					   map->s_uspace.s_bitmap, -					   partition, goal, err); +		block = udf_bitmap_new_block(sb, +					     map->s_uspace.s_bitmap, +					     partition, goal, err);  	else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) -		return udf_table_new_block(sb, inode, -					   map->s_uspace.s_table, -					   partition, goal, err); -	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) -		return udf_bitmap_new_block(sb, inode, -					    map->s_fspace.s_bitmap, +		block = udf_table_new_block(sb, +					    map->s_uspace.s_table,  					    partition, goal, err); +	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) +		block = udf_bitmap_new_block(sb, +					     map->s_fspace.s_bitmap, +					     partition, goal, err);  	else if 
(map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) -		return udf_table_new_block(sb, inode, -					   map->s_fspace.s_table, -					   partition, goal, err); +		block = udf_table_new_block(sb, +					    map->s_fspace.s_table, +					    partition, goal, err);  	else {  		*err = -EIO;  		return 0;  	} +	if (inode && block) +		inode_add_bytes(inode, sb->s_blocksize); +	return block;  } diff --git a/fs/udf/file.c b/fs/udf/file.c index dca0c3881e8..7f3f7ba3df6 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -87,10 +87,10 @@ static int udf_adinicb_write_end(struct file *file,  	char *kaddr;  	struct udf_inode_info *iinfo = UDF_I(inode); -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,  		kaddr + offset, copied); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	return simple_write_end(file, mapping, pos, len, copied, page, fsdata);  } @@ -201,12 +201,10 @@ out:  static int udf_release_file(struct inode *inode, struct file *filp)  {  	if (filp->f_mode & FMODE_WRITE) { -		mutex_lock(&inode->i_mutex);  		down_write(&UDF_I(inode)->i_data_sem);  		udf_discard_prealloc(inode);  		udf_truncate_tail_extent(inode);  		up_write(&UDF_I(inode)->i_data_sem); -		mutex_unlock(&inode->i_mutex);  	}  	return 0;  } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 05ab48195be..7e5aae4bf46 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -116,6 +116,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)  	iinfo->i_lenEAttr = 0;  	iinfo->i_lenAlloc = 0;  	iinfo->i_use = 0; +	iinfo->i_checkpoint = 1;  	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))  		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;  	else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7699df7b319..7d752800835 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1358,6 +1358,7 @@ static void udf_fill_inode(struct inode *inode, struct 
buffer_head *bh)  		iinfo->i_unique = le64_to_cpu(fe->uniqueID);  		iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);  		iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); +		iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint);  		offset = sizeof(struct fileEntry) + iinfo->i_lenEAttr;  	} else {  		inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << @@ -1379,6 +1380,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)  		iinfo->i_unique = le64_to_cpu(efe->uniqueID);  		iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);  		iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); +		iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);  		offset = sizeof(struct extendedFileEntry) +  							iinfo->i_lenEAttr;  	} @@ -1495,6 +1497,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)  	struct buffer_head *bh = NULL;  	struct fileEntry *fe;  	struct extendedFileEntry *efe; +	uint64_t lb_recorded;  	uint32_t udfperms;  	uint16_t icbflags;  	uint16_t crclen; @@ -1589,13 +1592,18 @@ static int udf_update_inode(struct inode *inode, int do_sync)  		dsea->minorDeviceIdent = cpu_to_le32(iminor(inode));  	} +	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) +		lb_recorded = 0; /* No extents => no blocks! 
*/ +	else +		lb_recorded = +			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> +			(blocksize_bits - 9); +  	if (iinfo->i_efe == 0) {  		memcpy(bh->b_data + sizeof(struct fileEntry),  		       iinfo->i_ext.i_data,  		       inode->i_sb->s_blocksize - sizeof(struct fileEntry)); -		fe->logicalBlocksRecorded = cpu_to_le64( -			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> -			(blocksize_bits - 9)); +		fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);  		udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);  		udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); @@ -1607,6 +1615,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)  		fe->uniqueID = cpu_to_le64(iinfo->i_unique);  		fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);  		fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); +		fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);  		fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE);  		crclen = sizeof(struct fileEntry);  	} else { @@ -1615,9 +1624,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)  		       inode->i_sb->s_blocksize -  					sizeof(struct extendedFileEntry));  		efe->objectSize = cpu_to_le64(inode->i_size); -		efe->logicalBlocksRecorded = cpu_to_le64( -			(inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> -			(blocksize_bits - 9)); +		efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);  		if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec ||  		    (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec && @@ -1646,6 +1653,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)  		efe->uniqueID = cpu_to_le64(iinfo->i_unique);  		efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);  		efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); +		efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);  		efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);  		crclen = sizeof(struct extendedFileEntry);  	} diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 
08bf46edf9c..38de8f234b9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -32,8 +32,6 @@  #include <linux/crc-itu-t.h>  #include <linux/exportfs.h> -enum { UDF_MAX_LINKS = 0xffff }; -  static inline int udf_match(int len1, const unsigned char *name1, int len2,  			    const unsigned char *name2)  { @@ -649,10 +647,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)  	struct udf_inode_info *dinfo = UDF_I(dir);  	struct udf_inode_info *iinfo; -	err = -EMLINK; -	if (dir->i_nlink >= UDF_MAX_LINKS) -		goto out; -  	err = -EIO;  	inode = udf_new_inode(dir, S_IFDIR | mode, &err);  	if (!inode) @@ -1032,9 +1026,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,  	struct fileIdentDesc cfi, *fi;  	int err; -	if (inode->i_nlink >= UDF_MAX_LINKS) -		return -EMLINK; -  	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);  	if (!fi) {  		return err; @@ -1126,10 +1117,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,  		if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=  				old_dir->i_ino)  			goto end_rename; - -		retval = -EMLINK; -		if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS) -			goto end_rename;  	}  	if (!nfi) {  		nfi = udf_add_entry(new_dir, new_dentry, &nfibh, &ncfi, diff --git a/fs/udf/super.c b/fs/udf/super.c index c09a84daaf5..ac8a348dcb6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -75,6 +75,8 @@  #define UDF_DEFAULT_BLOCKSIZE 2048 +enum { UDF_MAX_LINKS = 0xffff }; +  /* These are the "meat" - everything else is stuffing */  static int udf_fill_super(struct super_block *, void *, int);  static void udf_put_super(struct super_block *); @@ -948,11 +950,8 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)  	else  		bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ -	if (bitmap == NULL) { -		udf_err(sb, "Unable to allocate space for bitmap and %d buffer_head pointers\n", -			nr_groups); +	if (bitmap == NULL)  		return NULL; -	}  	
bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);  	bitmap->s_nr_groups = nr_groups; @@ -2035,13 +2034,13 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)  	}  	/* Allocate a dentry for the root inode */ -	sb->s_root = d_alloc_root(inode); +	sb->s_root = d_make_root(inode);  	if (!sb->s_root) {  		udf_err(sb, "Couldn't allocate root dentry\n"); -		iput(inode);  		goto error_out;  	}  	sb->s_maxbytes = MAX_LFS_FILESIZE; +	sb->s_max_links = UDF_MAX_LINKS;  	return 0;  error_out: diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index d1bd31ea724..bb8309dcd5c 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -23,6 +23,7 @@ struct udf_inode_info {  	__u64			i_lenExtents;  	__u32			i_next_alloc_block;  	__u32			i_next_alloc_goal; +	__u32			i_checkpoint;  	unsigned		i_alloc_type : 3;  	unsigned		i_efe : 1;	/* extendedFileEntry */  	unsigned		i_use : 1;	/* unallocSpaceEntry */ diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 38cac199edf..a2281cadefa 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -166,10 +166,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,  	int error;  	lock_ufs(dir->i_sb); -	if (inode->i_nlink >= UFS_LINK_MAX) { -		unlock_ufs(dir->i_sb); -		return -EMLINK; -	}  	inode->i_ctime = CURRENT_TIME_SEC;  	inode_inc_link_count(inode); @@ -183,10 +179,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,  static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)  {  	struct inode * inode; -	int err = -EMLINK; - -	if (dir->i_nlink >= UFS_LINK_MAX) -		goto out; +	int err;  	lock_ufs(dir->i_sb);  	inode_inc_link_count(dir); @@ -305,11 +298,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,  			drop_nlink(new_inode);  		inode_dec_link_count(new_inode);  	} else { -		if (dir_de) { -			err = -EMLINK; -			if (new_dir->i_nlink >= UFS_LINK_MAX) -				goto out_dir; -		}  		err = ufs_add_link(new_dentry, old_inode);  		if (err)  			goto 
out_dir; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 5246ee3e560..f636f6b460d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1157,16 +1157,17 @@ magic_found:  			    "fast symlink size (%u)\n", uspi->s_maxsymlinklen);  		uspi->s_maxsymlinklen = maxsymlen;  	} +	sb->s_max_links = UFS_LINK_MAX;  	inode = ufs_iget(sb, UFS_ROOTINO);  	if (IS_ERR(inode)) {  		ret = PTR_ERR(inode);  		goto failed;  	} -	sb->s_root = d_alloc_root(inode); +	sb->s_root = d_make_root(inode);  	if (!sb->s_root) {  		ret = -ENOMEM; -		goto dalloc_failed; +		goto failed;  	}  	ufs_setup_cstotal(sb); @@ -1180,8 +1181,6 @@ magic_found:  	UFSD("EXIT\n");  	return 0; -dalloc_failed: -	iput(inode);  failed:  	if (ubh)  		ubh_brelse_uspi (uspi); diff --git a/fs/xattr.c b/fs/xattr.c index 82f43376c7c..d6dfd247bb2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -16,7 +16,7 @@  #include <linux/security.h>  #include <linux/evm.h>  #include <linux/syscalls.h> -#include <linux/module.h> +#include <linux/export.h>  #include <linux/fsnotify.h>  #include <linux/audit.h>  #include <asm/uaccess.h> diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c index 8d5a506c82e..69d06b07b16 100644 --- a/fs/xattr_acl.c +++ b/fs/xattr_acl.c @@ -5,7 +5,7 @@   * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>   */ -#include <linux/module.h> +#include <linux/export.h>  #include <linux/fs.h>  #include <linux/posix_acl_xattr.h>  #include <linux/gfp.h> diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 866de277079..e44ef7ee8ce 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -118,17 +118,6 @@ xfs_rename(  	new_parent = (src_dp != target_dp);  	src_is_directory = S_ISDIR(src_ip->i_d.di_mode); -	if (src_is_directory) { -		/* -		 * Check for link count overflow on target_dp -		 */ -		if (target_ip == NULL && new_parent && -		    target_dp->i_d.di_nlink >= XFS_MAXLINK) { -			error = XFS_ERROR(EMLINK); -			goto std_return; -		} -	} -  	xfs_sort_for_rename(src_dp, target_dp, src_ip, 
target_ip,  				inodes, &num_inodes); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e1c623b43ab..dab9a5f6dfd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1358,6 +1358,7 @@ xfs_fs_fill_super(  	sb->s_blocksize = mp->m_sb.sb_blocksize;  	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;  	sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); +	sb->s_max_links = XFS_MAXLINK;  	sb->s_time_gran = 1;  	set_posix_acl_flag(sb); @@ -1378,10 +1379,10 @@ xfs_fs_fill_super(  		error = EINVAL;  		goto out_syncd_stop;  	} -	sb->s_root = d_alloc_root(root); +	sb->s_root = d_make_root(root);  	if (!sb->s_root) {  		error = ENOMEM; -		goto out_iput; +		goto out_syncd_stop;  	}  	return 0; @@ -1402,8 +1403,6 @@ out_destroy_workqueues:   out:  	return -error; - out_iput: -	iput(root);   out_syncd_stop:  	xfs_syncd_stop(mp);   out_unmount: diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 89dbb4a5087..79c05ac85bf 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -296,8 +296,6 @@ xfs_bumplink(  	xfs_trans_t *tp,  	xfs_inode_t *ip)  { -	if (ip->i_d.di_nlink >= XFS_MAXLINK) -		return XFS_ERROR(EMLINK);  	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);  	ASSERT(ip->i_d.di_nlink > 0); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ebdb88840a4..64981d7e737 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -917,14 +917,6 @@ xfs_create(  	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);  	unlock_dp_on_error = B_TRUE; -	/* -	 * Check for directory link count overflow. -	 */ -	if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) { -		error = XFS_ERROR(EMLINK); -		goto out_trans_cancel; -	} -  	xfs_bmap_init(&free_list, &first_block);  	/* @@ -1429,14 +1421,6 @@ xfs_link(  	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);  	/* -	 * If the source has too many links, we can't make any more to it. 
-	 */ -	if (sip->i_d.di_nlink >= XFS_MAXLINK) { -		error = XFS_ERROR(EMLINK); -		goto error_return; -	} - -	/*  	 * If we are using project inheritance, we only allow hard link  	 * creation in our tree when the project IDs are the same; else  	 * the tree quota mechanism could be circumvented.  |