summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/configfs/configfs_internal.h3
-rw-r--r--fs/configfs/dir.c210
-rw-r--r--fs/configfs/symlink.c26
-rw-r--r--fs/ocfs2/aops.c29
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/journal.c173
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/ocfs2_fs.h5
-rw-r--r--fs/ocfs2/super.c12
10 files changed, 367 insertions, 98 deletions
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da015c12e3ea..762d287123ca 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -49,8 +49,10 @@ struct configfs_dirent {
#define CONFIGFS_USET_DEFAULT 0x0080
#define CONFIGFS_USET_DROPPING 0x0100
#define CONFIGFS_USET_IN_MKDIR 0x0200
+#define CONFIGFS_USET_CREATING 0x0400
#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
+extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;
extern struct vfsmount * configfs_mount;
@@ -66,6 +68,7 @@ extern void configfs_inode_exit(void);
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
struct dentry *, void *, umode_t, int);
+extern int configfs_dirent_is_ready(struct configfs_dirent *);
extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 179589be063a..7a8db78a91d2 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -185,7 +185,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
if (!error)
error = configfs_make_dirent(p->d_fsdata, d, k, mode,
- CONFIGFS_DIR);
+ CONFIGFS_DIR | CONFIGFS_USET_CREATING);
if (!error) {
error = configfs_create(d, mode, init_dir);
if (!error) {
@@ -209,6 +209,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
* configfs_create_dir - create a directory for an config_item.
* @item: config_itemwe're creating directory for.
* @dentry: config_item's dentry.
+ *
+ * Note: user-created entries won't be allowed under this new directory
+ * until it is validated by configfs_dir_set_ready()
*/
static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
@@ -231,6 +234,44 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
return error;
}
+/*
+ * Allow userspace to create new entries under a new directory created with
+ * configfs_create_dir(), and under all of its chidlren directories recursively.
+ * @sd configfs_dirent of the new directory to validate
+ *
+ * Caller must hold configfs_dirent_lock.
+ */
+static void configfs_dir_set_ready(struct configfs_dirent *sd)
+{
+ struct configfs_dirent *child_sd;
+
+ sd->s_type &= ~CONFIGFS_USET_CREATING;
+ list_for_each_entry(child_sd, &sd->s_children, s_sibling)
+ if (child_sd->s_type & CONFIGFS_USET_CREATING)
+ configfs_dir_set_ready(child_sd);
+}
+
+/*
+ * Check that a directory does not belong to a directory hierarchy being
+ * attached and not validated yet.
+ * @sd configfs_dirent of the directory to check
+ *
+ * @return non-zero iff the directory was validated
+ *
+ * Note: takes configfs_dirent_lock, so the result may change from false to true
+ * in two consecutive calls, but never from true to false.
+ */
+int configfs_dirent_is_ready(struct configfs_dirent *sd)
+{
+ int ret;
+
+ spin_lock(&configfs_dirent_lock);
+ ret = !(sd->s_type & CONFIGFS_USET_CREATING);
+ spin_unlock(&configfs_dirent_lock);
+
+ return ret;
+}
+
int configfs_create_link(struct configfs_symlink *sl,
struct dentry *parent,
struct dentry *dentry)
@@ -283,6 +324,8 @@ static void remove_dir(struct dentry * d)
* The only thing special about this is that we remove any files in
* the directory before we remove the directory, and we've inlined
* what used to be configfs_rmdir() below, instead of calling separately.
+ *
+ * Caller holds the mutex of the item's inode
*/
static void configfs_remove_dir(struct config_item * item)
@@ -330,7 +373,19 @@ static struct dentry * configfs_lookup(struct inode *dir,
struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
struct configfs_dirent * sd;
int found = 0;
- int err = 0;
+ int err;
+
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ *
+ * This forbids userspace to read/write attributes of items which may
+ * not complete their initialization, since the dentries of the
+ * attributes won't be instantiated.
+ */
+ err = -ENOENT;
+ if (!configfs_dirent_is_ready(parent_sd))
+ goto out;
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
if (sd->s_type & CONFIGFS_NOT_PINNED) {
@@ -353,6 +408,7 @@ static struct dentry * configfs_lookup(struct inode *dir,
return simple_lookup(dir, dentry, nd);
}
+out:
return ERR_PTR(err);
}
@@ -370,13 +426,17 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
struct configfs_dirent *sd;
int ret;
+ /* Mark that we're trying to drop the group */
+ parent_sd->s_type |= CONFIGFS_USET_DROPPING;
+
ret = -EBUSY;
if (!list_empty(&parent_sd->s_links))
goto out;
ret = 0;
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
- if (sd->s_type & CONFIGFS_NOT_PINNED)
+ if (!sd->s_element ||
+ (sd->s_type & CONFIGFS_NOT_PINNED))
continue;
if (sd->s_type & CONFIGFS_USET_DEFAULT) {
/* Abort if racing with mkdir() */
@@ -385,8 +445,6 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
*wait_mutex = &sd->s_dentry->d_inode->i_mutex;
return -EAGAIN;
}
- /* Mark that we're trying to drop the group */
- sd->s_type |= CONFIGFS_USET_DROPPING;
/*
* Yup, recursive. If there's a problem, blame
@@ -414,12 +472,11 @@ static void configfs_detach_rollback(struct dentry *dentry)
struct configfs_dirent *parent_sd = dentry->d_fsdata;
struct configfs_dirent *sd;
- list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
- if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+ parent_sd->s_type &= ~CONFIGFS_USET_DROPPING;
+
+ list_for_each_entry(sd, &parent_sd->s_children, s_sibling)
+ if (sd->s_type & CONFIGFS_USET_DEFAULT)
configfs_detach_rollback(sd->s_dentry);
- sd->s_type &= ~CONFIGFS_USET_DROPPING;
- }
- }
}
static void detach_attrs(struct config_item * item)
@@ -558,36 +615,21 @@ static int create_default_group(struct config_group *parent_group,
static int populate_groups(struct config_group *group)
{
struct config_group *new_group;
- struct dentry *dentry = group->cg_item.ci_dentry;
int ret = 0;
int i;
if (group->default_groups) {
- /*
- * FYI, we're faking mkdir here
- * I'm not sure we need this semaphore, as we're called
- * from our parent's mkdir. That holds our parent's
- * i_mutex, so afaik lookup cannot continue through our
- * parent to find us, let alone mess with our tree.
- * That said, taking our i_mutex is closer to mkdir
- * emulation, and shouldn't hurt.
- */
- mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
-
for (i = 0; group->default_groups[i]; i++) {
new_group = group->default_groups[i];
ret = create_default_group(group, new_group);
- if (ret)
+ if (ret) {
+ detach_groups(group);
break;
+ }
}
-
- mutex_unlock(&dentry->d_inode->i_mutex);
}
- if (ret)
- detach_groups(group);
-
return ret;
}
@@ -702,7 +744,15 @@ static int configfs_attach_item(struct config_item *parent_item,
if (!ret) {
ret = populate_attrs(item);
if (ret) {
+ /*
+ * We are going to remove an inode and its dentry but
+ * the VFS may already have hit and used them. Thus,
+ * we must lock them as rmdir() would.
+ */
+ mutex_lock(&dentry->d_inode->i_mutex);
configfs_remove_dir(item);
+ dentry->d_inode->i_flags |= S_DEAD;
+ mutex_unlock(&dentry->d_inode->i_mutex);
d_delete(dentry);
}
}
@@ -710,6 +760,7 @@ static int configfs_attach_item(struct config_item *parent_item,
return ret;
}
+/* Caller holds the mutex of the item's inode */
static void configfs_detach_item(struct config_item *item)
{
detach_attrs(item);
@@ -728,16 +779,30 @@ static int configfs_attach_group(struct config_item *parent_item,
sd = dentry->d_fsdata;
sd->s_type |= CONFIGFS_USET_DIR;
+ /*
+ * FYI, we're faking mkdir in populate_groups()
+ * We must lock the group's inode to avoid races with the VFS
+ * which can already hit the inode and try to add/remove entries
+ * under it.
+ *
+ * We must also lock the inode to remove it safely in case of
+ * error, as rmdir() would.
+ */
+ mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
ret = populate_groups(to_config_group(item));
if (ret) {
configfs_detach_item(item);
- d_delete(dentry);
+ dentry->d_inode->i_flags |= S_DEAD;
}
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ if (ret)
+ d_delete(dentry);
}
return ret;
}
+/* Caller holds the mutex of the group's inode */
static void configfs_detach_group(struct config_item *item)
{
detach_groups(to_config_group(item));
@@ -1035,7 +1100,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct configfs_subsystem *subsys;
struct configfs_dirent *sd;
struct config_item_type *type;
- struct module *owner = NULL;
+ struct module *subsys_owner = NULL, *new_item_owner = NULL;
char *name;
if (dentry->d_parent == configfs_sb->s_root) {
@@ -1044,6 +1109,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
}
sd = dentry->d_parent->d_fsdata;
+
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ */
+ if (!configfs_dirent_is_ready(sd)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
if (!(sd->s_type & CONFIGFS_USET_DIR)) {
ret = -EPERM;
goto out;
@@ -1062,10 +1137,25 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
goto out_put;
}
+ /*
+ * The subsystem may belong to a different module than the item
+ * being created. We don't want to safely pin the new item but
+ * fail to pin the subsystem it sits under.
+ */
+ if (!subsys->su_group.cg_item.ci_type) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+ subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+ if (!try_module_get(subsys_owner)) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
- goto out_put;
+ goto out_subsys_put;
}
snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
@@ -1094,10 +1184,10 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
kfree(name);
if (ret) {
/*
- * If item == NULL, then link_obj() was never called.
+ * If ret != 0, then link_obj() was never called.
* There are no extra references to clean up.
*/
- goto out_put;
+ goto out_subsys_put;
}
/*
@@ -1111,8 +1201,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
goto out_unlink;
}
- owner = type->ct_owner;
- if (!try_module_get(owner)) {
+ new_item_owner = type->ct_owner;
+ if (!try_module_get(new_item_owner)) {
ret = -EINVAL;
goto out_unlink;
}
@@ -1142,6 +1232,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
spin_lock(&configfs_dirent_lock);
sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
+ if (!ret)
+ configfs_dir_set_ready(dentry->d_fsdata);
spin_unlock(&configfs_dirent_lock);
out_unlink:
@@ -1159,9 +1251,13 @@ out_unlink:
mutex_unlock(&subsys->su_mutex);
if (module_got)
- module_put(owner);
+ module_put(new_item_owner);
}
+out_subsys_put:
+ if (ret)
+ module_put(subsys_owner);
+
out_put:
/*
* link_obj()/link_group() took a reference from child->parent,
@@ -1180,7 +1276,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct config_item *item;
struct configfs_subsystem *subsys;
struct configfs_dirent *sd;
- struct module *owner = NULL;
+ struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
if (dentry->d_parent == configfs_sb->s_root)
@@ -1207,6 +1303,15 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
return -EINVAL;
}
+ /* configfs_mkdir() shouldn't have allowed this */
+ BUG_ON(!subsys->su_group.cg_item.ci_type);
+ subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+
+ /*
+ * Ensure that no racing symlink() will make detach_prep() fail while
+ * the new link is temporarily attached
+ */
+ mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
do {
struct mutex *wait_mutex;
@@ -1215,6 +1320,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
if (ret) {
configfs_detach_rollback(dentry);
spin_unlock(&configfs_dirent_lock);
+ mutex_unlock(&configfs_symlink_mutex);
if (ret != -EAGAIN) {
config_item_put(parent_item);
return ret;
@@ -1224,10 +1330,12 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
mutex_lock(wait_mutex);
mutex_unlock(wait_mutex);
+ mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
}
} while (ret == -EAGAIN);
spin_unlock(&configfs_dirent_lock);
+ mutex_unlock(&configfs_symlink_mutex);
/* Get a working ref for the duration of this function */
item = configfs_get_config_item(dentry);
@@ -1236,7 +1344,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
config_item_put(parent_item);
if (item->ci_type)
- owner = item->ci_type->ct_owner;
+ dead_item_owner = item->ci_type->ct_owner;
if (sd->s_type & CONFIGFS_USET_DIR) {
configfs_detach_group(item);
@@ -1258,7 +1366,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
/* Drop our reference from above */
config_item_put(item);
- module_put(owner);
+ module_put(dead_item_owner);
+ module_put(subsys_owner);
return 0;
}
@@ -1314,13 +1423,24 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
{
struct dentry * dentry = file->f_path.dentry;
struct configfs_dirent * parent_sd = dentry->d_fsdata;
+ int err;
mutex_lock(&dentry->d_inode->i_mutex);
- file->private_data = configfs_new_dirent(parent_sd, NULL);
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ */
+ err = -ENOENT;
+ if (configfs_dirent_is_ready(parent_sd)) {
+ file->private_data = configfs_new_dirent(parent_sd, NULL);
+ if (IS_ERR(file->private_data))
+ err = PTR_ERR(file->private_data);
+ else
+ err = 0;
+ }
mutex_unlock(&dentry->d_inode->i_mutex);
- return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0;
-
+ return err;
}
static int configfs_dir_close(struct inode *inode, struct file *file)
@@ -1491,6 +1611,10 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
if (err) {
d_delete(dentry);
dput(dentry);
+ } else {
+ spin_lock(&configfs_dirent_lock);
+ configfs_dir_set_ready(dentry->d_fsdata);
+ spin_unlock(&configfs_dirent_lock);
}
}
@@ -1517,11 +1641,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
I_MUTEX_PARENT);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
if (configfs_detach_prep(dentry, NULL)) {
printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
}
spin_unlock(&configfs_dirent_lock);
+ mutex_unlock(&configfs_symlink_mutex);
configfs_detach_group(&group->cg_item);
dentry->d_inode->i_flags |= S_DEAD;
mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 0004d18c40ac..bf74973b0492 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -31,6 +31,9 @@
#include <linux/configfs.h>
#include "configfs_internal.h"
+/* Protects attachments of new symlinks */
+DEFINE_MUTEX(configfs_symlink_mutex);
+
static int item_depth(struct config_item * item)
{
struct config_item * p = item;
@@ -73,11 +76,20 @@ static int create_link(struct config_item *parent_item,
struct configfs_symlink *sl;
int ret;
+ ret = -ENOENT;
+ if (!configfs_dirent_is_ready(target_sd))
+ goto out;
ret = -ENOMEM;
sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
if (sl) {
sl->sl_target = config_item_get(item);
spin_lock(&configfs_dirent_lock);
+ if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
+ spin_unlock(&configfs_dirent_lock);
+ config_item_put(item);
+ kfree(sl);
+ return -ENOENT;
+ }
list_add(&sl->sl_list, &target_sd->s_links);
spin_unlock(&configfs_dirent_lock);
ret = configfs_create_link(sl, parent_item->ci_dentry,
@@ -91,6 +103,7 @@ static int create_link(struct config_item *parent_item,
}
}
+out:
return ret;
}
@@ -120,6 +133,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
{
int ret;
struct nameidata nd;
+ struct configfs_dirent *sd;
struct config_item *parent_item;
struct config_item *target_item;
struct config_item_type *type;
@@ -128,9 +142,19 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
if (dentry->d_parent == configfs_sb->s_root)
goto out;
+ sd = dentry->d_parent->d_fsdata;
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ */
+ ret = -ENOENT;
+ if (!configfs_dirent_is_ready(sd))
+ goto out;
+
parent_item = configfs_get_config_item(dentry->d_parent);
type = parent_item->ci_type;
+ ret = -EPERM;
if (!type || !type->ct_item_ops ||
!type->ct_item_ops->allow_link)
goto out_put;
@@ -141,7 +165,9 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
ret = type->ct_item_ops->allow_link(parent_item, target_item);
if (!ret) {
+ mutex_lock(&configfs_symlink_mutex);
ret = create_link(parent_item, target_item, dentry);
+ mutex_unlock(&configfs_symlink_mutex);
if (ret && type->ct_item_ops->drop_link)
type->ct_item_ops->drop_link(parent_item,
target_item);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1db080135c6d..506c24fb5078 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1073,12 +1073,15 @@ static void ocfs2_write_failure(struct inode *inode,
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (ocfs2_should_order_data(inode))
- walk_page_buffers(wc->w_handle, page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-
- block_commit_write(tmppage, from, to);
+ if (page_has_buffers(tmppage)) {
+ if (ocfs2_should_order_data(inode))
+ walk_page_buffers(wc->w_handle,
+ page_buffers(tmppage),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+
+ block_commit_write(tmppage, from, to);
+ }
}
}
@@ -1901,12 +1904,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
to = PAGE_CACHE_SIZE;
}
- if (ocfs2_should_order_data(inode))
- walk_page_buffers(wc->w_handle, page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-
- block_commit_write(tmppage, from, to);
+ if (page_has_buffers(tmppage)) {
+ if (ocfs2_should_order_data(inode))
+ walk_page_buffers(wc->w_handle,
+ page_buffers(tmppage),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+ block_commit_write(tmppage, from, to);
+ }
}
out_write_size:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index be2dd95d3a1d..ec2ed15c3daa 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1766,8 +1766,8 @@ out_inode_unlock:
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
- mutex_unlock(&inode->i_mutex);
out:
+ mutex_unlock(&inode->i_mutex);
return ret;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a8c19cb3cfdd..7a37240f7a31 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
- int dirty);
+ int dirty, int replayed);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
@@ -562,8 +562,18 @@ done:
return status;
}
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+ le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
+}
+
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+ return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+}
+
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
- int dirty)
+ int dirty, int replayed)
{
int status;
unsigned int flags;
@@ -593,6 +603,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
flags &= ~OCFS2_JOURNAL_DIRTY_FL;
fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+ if (replayed)
+ ocfs2_bump_recovery_generation(fe);
+
status = ocfs2_write_block(osb, bh, journal->j_inode);
if (status < 0)
mlog_errno(status);
@@ -667,7 +680,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
* Do not toggle if flush was unsuccessful otherwise
* will leave dirty metadata in a "clean" journal
*/
- status = ocfs2_journal_toggle_dirty(osb, 0);
+ status = ocfs2_journal_toggle_dirty(osb, 0, 0);
if (status < 0)
mlog_errno(status);
}
@@ -710,7 +723,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
}
}
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
{
int status = 0;
struct ocfs2_super *osb;
@@ -729,7 +742,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
- status = ocfs2_journal_toggle_dirty(osb, 1);
+ status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
if (status < 0) {
mlog_errno(status);
goto done;
@@ -771,7 +784,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
goto bail;
}
- status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+ status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -1034,6 +1047,12 @@ restart:
spin_unlock(&osb->osb_lock);
mlog(0, "All nodes recovered\n");
+ /* Refresh all journal recovery generations from disk */
+ status = ocfs2_check_journals_nolocks(osb);
+ status = (status == -EROFS) ? 0 : status;
+ if (status < 0)
+ mlog_errno(status);
+
ocfs2_super_unlock(osb, 1);
/* We always run recovery on our own orphan dir - the dead
@@ -1096,6 +1115,42 @@ out:
mlog_exit_void();
}
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+ int slot_num,
+ struct buffer_head **bh,
+ struct inode **ret_inode)
+{
+ int status = -EACCES;
+ struct inode *inode = NULL;
+
+ BUG_ON(slot_num >= osb->max_slots);
+
+ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+ slot_num);
+ if (!inode || is_bad_inode(inode)) {
+ mlog_errno(status);
+ goto bail;
+ }
+ SET_INODE_JOURNAL(inode);
+
+ status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = 0;
+
+bail:
+ if (inode) {
+ if (status || !ret_inode)
+ iput(inode);
+ else
+ *ret_inode = inode;
+ }
+ return status;
+}
+
/* Does the actual journal replay and marks the journal inode as
* clean. Will only replay if the journal inode is marked dirty. */
static int ocfs2_replay_journal(struct ocfs2_super *osb,
@@ -1109,22 +1164,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
struct ocfs2_dinode *fe;
journal_t *journal = NULL;
struct buffer_head *bh = NULL;
+ u32 slot_reco_gen;
- inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
- slot_num);
- if (inode == NULL) {
- status = -EACCES;
+ status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
+ if (status) {
mlog_errno(status);
goto done;
}
- if (is_bad_inode(inode)) {
- status = -EACCES;
- iput(inode);
- inode = NULL;
- mlog_errno(status);
+
+ fe = (struct ocfs2_dinode *)bh->b_data;
+ slot_reco_gen = ocfs2_get_recovery_generation(fe);
+ brelse(bh);
+ bh = NULL;
+
+ /*
+ * As the fs recovery is asynchronous, there is a small chance that
+ * another node mounted (and recovered) the slot before the recovery
+ * thread could get the lock. To handle that, we dirty read the journal
+ * inode for that slot to get the recovery generation. If it is
+ * different than what we expected, the slot has been recovered.
+ * If not, it needs recovery.
+ */
+ if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+ mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+ osb->slot_recovery_generations[slot_num], slot_reco_gen);
+ osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+ status = -EBUSY;
goto done;
}
- SET_INODE_JOURNAL(inode);
+
+ /* Continue with recovery as the journal has not yet been recovered */
status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
@@ -1138,9 +1207,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
fe = (struct ocfs2_dinode *) bh->b_data;
flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+ slot_reco_gen = ocfs2_get_recovery_generation(fe);
if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
mlog(0, "No recovery required for node %d\n", node_num);
+ /* Refresh recovery generation for the slot */
+ osb->slot_recovery_generations[slot_num] = slot_reco_gen;
goto done;
}
@@ -1188,6 +1260,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
flags &= ~OCFS2_JOURNAL_DIRTY_FL;
fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+ /* Increment recovery generation to indicate successful recovery */
+ ocfs2_bump_recovery_generation(fe);
+ osb->slot_recovery_generations[slot_num] =
+ ocfs2_get_recovery_generation(fe);
+
status = ocfs2_write_block(osb, bh, inode);
if (status < 0)
mlog_errno(status);
@@ -1252,6 +1329,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
status = ocfs2_replay_journal(osb, node_num, slot_num);
if (status < 0) {
+ if (status == -EBUSY) {
+ mlog(0, "Skipping recovery for slot %u (node %u) "
+ "as another node has recovered it\n", slot_num,
+ node_num);
+ status = 0;
+ goto done;
+ }
mlog_errno(status);
goto done;
}
@@ -1334,12 +1418,29 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
{
unsigned int node_num;
int status, i;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *di;
/* This is called with the super block cluster lock, so we
* know that the slot map can't change underneath us. */
spin_lock(&osb->osb_lock);
for (i = 0; i < osb->max_slots; i++) {
+ /* Read journal inode to get the recovery generation */
+ status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ di = (struct ocfs2_dinode *)bh->b_data;
+ osb->slot_recovery_generations[i] =
+ ocfs2_get_recovery_generation(di);
+ brelse(bh);
+ bh = NULL;
+
+ mlog(0, "Slot %u recovery generation is %u\n", i,
+ osb->slot_recovery_generations[i]);
+
if (i == osb->slot_num)
continue;
@@ -1603,49 +1704,41 @@ static int ocfs2_commit_thread(void *arg)
return 0;
}
-/* Look for a dirty journal without taking any cluster locks. Used for
- * hard readonly access to determine whether the file system journals
- * require recovery. */
+/* Reads all the journal inodes without taking any cluster locks. Used
+ * for hard readonly access to determine whether any journal requires
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
{
int ret = 0;
unsigned int slot;
- struct buffer_head *di_bh;
+ struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- struct inode *journal = NULL;
+ int journal_dirty = 0;
for(slot = 0; slot < osb->max_slots; slot++) {
- journal = ocfs2_get_system_file_inode(osb,
- JOURNAL_SYSTEM_INODE,
- slot);
- if (!journal || is_bad_inode(journal)) {
- ret = -EACCES;
- mlog_errno(ret);
- goto out;
- }
-
- di_bh = NULL;
- ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
- 0, journal);
- if (ret < 0) {
+ ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
+ if (ret) {
mlog_errno(ret);
goto out;
}
di = (struct ocfs2_dinode *) di_bh->b_data;
+ osb->slot_recovery_generations[slot] =
+ ocfs2_get_recovery_generation(di);
+
if (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL)
- ret = -EROFS;
+ journal_dirty = 1;
brelse(di_bh);
- if (ret)
- break;
+ di_bh = NULL;
}
out:
- if (journal)
- iput(journal);
-
+ if (journal_dirty)
+ ret = -EROFS;
return ret;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index db82be2532ed..2178ebffa05f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -161,7 +161,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal,
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local);
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+ int replayed);
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
void ocfs2_recovery_thread(struct ocfs2_super *osb,
int node_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1cb814be8ef1..7f625f2b1117 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -204,6 +204,8 @@ struct ocfs2_super
struct ocfs2_slot_info *slot_info;
+ u32 *slot_recovery_generations;
+
spinlock_t node_map_lock;
u64 root_blkno;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3f1945177629..4f619850ccf7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -660,7 +660,10 @@ struct ocfs2_dinode {
struct { /* Info for journal system
inodes */
__le32 ij_flags; /* Mounted, version, etc. */
- __le32 ij_pad;
+ __le32 ij_recovery_generation; /* Incremented when the
+ journal is recovered
+ after an unclean
+ shutdown */
} journal1;
} id1; /* Inode type dependant 1 */
/*C0*/ union {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2560b33889aa..88255d3f52b4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1442,6 +1442,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+ osb->slot_recovery_generations =
+ kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+ GFP_KERNEL);
+ if (!osb->slot_recovery_generations) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
init_waitqueue_head(&osb->osb_wipe_event);
osb->osb_orphan_wipes = kcalloc(osb->max_slots,
sizeof(*osb->osb_orphan_wipes),
@@ -1703,7 +1712,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
local = ocfs2_mount_local(osb);
/* will play back anything left in the journal. */
- status = ocfs2_journal_load(osb->journal, local);
+ status = ocfs2_journal_load(osb->journal, local, dirty);
if (status < 0) {
mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
goto finally;
@@ -1768,6 +1777,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
+ kfree(osb->slot_recovery_generations);
/* FIXME
* This belongs in journal shutdown, but because we have to
* allocate osb->journal at the start of ocfs2_initalize_osb(),