diff options
author | David S. Miller <davem@davemloft.net> | 2016-10-30 12:42:58 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-10-30 12:42:58 -0400 |
commit | 27058af401e49d88a905df000dd26f443fcfa8ce (patch) | |
tree | 819f32113d3b8374b9fbf72e2202d4c4d4511a60 /fs | |
parent | 357f4aae859b5d74554b0ccbb18556f1df4166c3 (diff) | |
parent | 2a26d99b251b8625d27aed14e97fc10707a3a81f (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Mostly simple overlapping changes.
For example, David Ahern's adjacency list revamp in 'net-next'
conflicted with an adjacency list traversal bug fix in 'net'.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'fs')
99 files changed, 2368 insertions, 1719 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 2037e7a77a37..d764236072b1 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -91,11 +91,9 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { */ bool afs_cm_incoming_call(struct afs_call *call) { - u32 operation_id = ntohl(call->operation_ID); + _enter("{CB.OP %u}", call->operation_ID); - _enter("{CB.OP %u}", operation_id); - - switch (operation_id) { + switch (call->operation_ID) { case CBCallBack: call->type = &afs_SRXCBCallBack; return true; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 96f4d764d1a6..31c616ab9b40 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -364,7 +364,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) buffer = kmap(page); ret = afs_extract_data(call, buffer, call->count, true); - kunmap(buffer); + kunmap(page); if (ret < 0) return ret; } @@ -397,7 +397,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) page = call->reply3; buffer = kmap(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap(buffer); + kunmap(page); } _leave(" = 0 [done]"); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5497c8496055..535a38d2c1d0 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -112,7 +112,7 @@ struct afs_call { bool need_attention; /* T if RxRPC poked us */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ - __be32 operation_ID; /* operation ID for an incoming call */ + u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ afs_dataversion_t store_version; /* updated version expected from store */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 477928b25940..25f05a8d21b1 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -676,10 +676,11 @@ static int afs_deliver_cm_op_id(struct afs_call *call) ASSERTCMP(call->offset, <, 4); /* the operation ID forms the first four bytes of the request data */ - ret = afs_extract_data(call, &call->operation_ID, 4, true); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; + call->operation_ID = ntohl(call->tmp); call->state = AFS_CALL_AWAIT_REQUEST; call->offset = 0; diff --git a/fs/befs/befs.h b/fs/befs/befs.h index e0f59263a96d..c6bad51d8ec7 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -43,7 +43,10 @@ struct befs_sb_info { u32 ag_shift; u32 num_ags; - /* jornal log entry */ + /* State of the superblock */ + u32 flags; + + /* Journal log entry */ befs_block_run log_blocks; befs_off_t log_start; befs_off_t log_end; @@ -79,7 +82,7 @@ enum befs_err { BEFS_BT_END, BEFS_BT_EMPTY, BEFS_BT_MATCH, - BEFS_BT_PARMATCH, + BEFS_BT_OVERFLOW, BEFS_BT_NOT_FOUND }; @@ -140,18 +143,6 @@ befs_iaddrs_per_block(struct super_block *sb) return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); } -static inline int -befs_iaddr_is_empty(const befs_inode_addr *iaddr) -{ - return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); -} - -static inline size_t -befs_brun_size(struct super_block *sb, befs_block_run run) -{ - return BEFS_SB(sb)->block_size * run.len; -} - #include "endian.h" #endif /* _LINUX_BEFS_H */ diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 307645f9e284..7e135ea73fdd 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -85,7 +85,7 @@ struct befs_btree_node { }; /* local constants */ -static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; +static const befs_off_t BEFS_BT_INVAL = 0xffffffffffffffffULL; /* local functions */ static int befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds, @@ -156,8 +156,6 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth); sup->data_type = fs32_to_cpu(sb, od_sup->data_type); sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr); - sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr); - sup->max_size = fs64_to_cpu(sb, od_sup->max_size); brelse(bh); if (sup->magic != BEFS_BTREE_MAGIC) { @@ -183,8 +181,8 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, * Calls befs_read_datastream to read in the indicated btree node and * makes sure its header fields are in cpu byteorder, byteswapping if * necessary. - * Note: node->bh must be NULL when this function called first - * time. Don't forget brelse(node->bh) after last call. + * Note: node->bh must be NULL when this function is called the first time. + * Don't forget brelse(node->bh) after last call. * * On success, returns BEFS_OK and *@node contains the btree node that * starts at @node_off, with the node->head fields in cpu byte order. @@ -244,7 +242,7 @@ befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds, * Read the superblock and rootnode of the b+tree. * Drill down through the interior nodes using befs_find_key(). * Once at the correct leaf node, use befs_find_key() again to get the - * actuall value stored with the key. + * actual value stored with the key. */ int befs_btree_find(struct super_block *sb, const befs_data_stream *ds, @@ -283,9 +281,9 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, while (!befs_leafnode(this_node)) { res = befs_find_key(sb, this_node, key, &node_off); - if (res == BEFS_BT_NOT_FOUND) + /* if no key set, try the overflow node */ + if (res == BEFS_BT_OVERFLOW) node_off = this_node->head.overflow; - /* if no match, go to overflow node */ if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " "node at %llu", node_off); @@ -293,15 +291,15 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, } } - /* at the correct leaf node now */ - + /* at a leaf node now, check if it is correct */ res = befs_find_key(sb, this_node, key, value); brelse(this_node->bh); kfree(this_node); if (res != BEFS_BT_MATCH) { - befs_debug(sb, "<--- %s Key %s not found", __func__, key); + befs_error(sb, "<--- %s Key %s not found", __func__, key); + befs_debug(sb, "<--- %s ERROR", __func__); *value = 0; return BEFS_BT_NOT_FOUND; } @@ -324,16 +322,12 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, * @findkey: Keystring to search for * @value: If key is found, the value stored with the key is put here * - * finds exact match if one exists, and returns BEFS_BT_MATCH - * If no exact match, finds first key in node that is greater - * (alphabetically) than the search key and returns BEFS_BT_PARMATCH - * (for partial match, I guess). Can you think of something better to - * call it? - * - * If no key was a match or greater than the search key, return - * BEFS_BT_NOT_FOUND. + * Finds exact match if one exists, and returns BEFS_BT_MATCH. + * If there is no match and node's value array is too small for key, return + * BEFS_BT_OVERFLOW. + * If no match and node should countain this key, return BEFS_BT_NOT_FOUND. * - * Use binary search instead of a linear. + * Uses binary search instead of a linear. */ static int befs_find_key(struct super_block *sb, struct befs_btree_node *node, @@ -348,18 +342,16 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, befs_debug(sb, "---> %s %s", __func__, findkey); - *value = 0; - findkey_len = strlen(findkey); - /* if node can not contain key, just skeep this node */ + /* if node can not contain key, just skip this node */ last = node->head.all_key_count - 1; thiskey = befs_bt_get_key(sb, node, last, &keylen); eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); if (eq < 0) { - befs_debug(sb, "<--- %s %s not found", __func__, findkey); - return BEFS_BT_NOT_FOUND; + befs_debug(sb, "<--- node can't contain %s", findkey); + return BEFS_BT_OVERFLOW; } valarray = befs_bt_valarray(node); @@ -387,12 +379,15 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, else first = mid + 1; } + + /* return an existing value so caller can arrive to a leaf node */ if (eq < 0) *value = fs64_to_cpu(sb, valarray[mid + 1]); else *value = fs64_to_cpu(sb, valarray[mid]); - befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid); - return BEFS_BT_PARMATCH; + befs_error(sb, "<--- %s %s not found", __func__, findkey); + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_BT_NOT_FOUND; } /** @@ -405,7 +400,7 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, * @keysize: Length of the returned key * @value: Value stored with the returned key * - * Heres how it works: Key_no is the index of the key/value pair to + * Here's how it works: Key_no is the index of the key/value pair to * return in keybuf/value. * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is * the number of characters in the key (just a convenience). @@ -422,7 +417,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, { struct befs_btree_node *this_node; befs_btree_super bt_super; - befs_off_t node_off = 0; + befs_off_t node_off; int cur_key; fs64 *valarray; char *keystart; @@ -467,7 +462,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, while (key_sum + this_node->head.all_key_count <= key_no) { /* no more nodes to look in: key_no is too large */ - if (this_node->head.right == befs_bt_inval) { + if (this_node->head.right == BEFS_BT_INVAL) { *keysize = 0; *value = 0; befs_debug(sb, @@ -541,7 +536,6 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, * @node_off: Pointer to offset of current node within datastream. Modified * by the function. * - * * Helper function for btree traverse. Moves the current position to the * start of the first leaf node. * @@ -608,7 +602,7 @@ static int befs_leafnode(struct befs_btree_node *node) { /* all interior nodes (and only interior nodes) have an overflow node */ - if (node->head.overflow == befs_bt_inval) + if (node->head.overflow == BEFS_BT_INVAL) return 1; else return 0; @@ -715,7 +709,7 @@ befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node, * * Returns 0 if @key1 and @key2 are equal. * Returns >0 if @key1 is greater. - * Returns <0 if @key2 is greater.. + * Returns <0 if @key2 is greater. */ static int befs_compare_strings(const void *key1, int keylen1, diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index af1bc19b7c85..b4c7ba013c0d 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -22,22 +22,22 @@ const befs_inode_addr BAD_IADDR = { 0, 0, 0 }; static int befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t blockno, befs_block_run * run); + befs_blocknr_t blockno, befs_block_run *run); static int befs_find_brun_indirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run); + befs_block_run *run); static int befs_find_brun_dblindirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run); + befs_block_run *run); /** * befs_read_datastream - get buffer_head containing data, starting from pos. * @sb: Filesystem superblock - * @ds: datastrem to find data with + * @ds: datastream to find data with * @pos: start of data * @off: offset of data in buffer_head->b_data * @@ -46,7 +46,7 @@ static int befs_find_brun_dblindirect(struct super_block *sb, */ struct buffer_head * befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, - befs_off_t pos, uint * off) + befs_off_t pos, uint *off) { struct buffer_head *bh; befs_block_run run; @@ -75,7 +75,13 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, return bh; } -/* +/** + * befs_fblock2brun - give back block run for fblock + * @sb: the superblock + * @data: datastream to read from + * @fblock: the blocknumber with the file position to find + * @run: The found run is passed back through this pointer + * * Takes a file position and gives back a brun who's starting block * is block number fblock of the file. * @@ -88,7 +94,7 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, */ int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t fblock, befs_block_run * run) + befs_blocknr_t fblock, befs_block_run *run) { int err; befs_off_t pos = fblock << BEFS_SB(sb)->block_shift; @@ -115,7 +121,7 @@ befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, /** * befs_read_lsmylink - read long symlink from datastream. * @sb: Filesystem superblock - * @ds: Datastrem to read from + * @ds: Datastream to read from * @buff: Buffer in which to place long symlink data * @len: Length of the long symlink in bytes * @@ -128,6 +134,7 @@ befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds, befs_off_t bytes_read = 0; /* bytes readed */ u16 plen; struct buffer_head *bh; + befs_debug(sb, "---> %s length: %llu", __func__, len); while (bytes_read < len) { @@ -183,13 +190,13 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds) metablocks += ds->indirect.len; /* - Double indir block, plus all the indirect blocks it mapps - In the double-indirect range, all block runs of data are - BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know - how many data block runs are in the double-indirect region, - and from that we know how many indirect blocks it takes to - map them. We assume that the indirect blocks are also - BEFS_DBLINDIR_BRUN_LEN blocks long. + * Double indir block, plus all the indirect blocks it maps. + * In the double-indirect range, all block runs of data are + * BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know + * how many data block runs are in the double-indirect region, + * and from that we know how many indirect blocks it takes to + * map them. We assume that the indirect blocks are also + * BEFS_DBLINDIR_BRUN_LEN blocks long. */ if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) { uint dbl_bytes; @@ -212,58 +219,50 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds) return blocks; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the direct region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - Linear search. Checks each element of array[] to see if it - contains the blockno-th filesystem block. This is necessary - because the block runs map variable amounts of data. Simply - keeps a count of the number of blocks searched so far (sum), - incrementing this by the length of each block run as we come - across it. Adds sum to *count before returning (this is so - you can search multiple arrays that are logicaly one array, - as in the indirect region code). - - When/if blockno is found, if blockno is inside of a block - run as stored on disk, we offset the start and length members - of the block run, so that blockno is the start and len is - still valid (the run ends in the same place). - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_direct - find a direct block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the direct region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * Linear search. Checks each element of array[] to see if it + * contains the blockno-th filesystem block. This is necessary + * because the block runs map variable amounts of data. Simply + * keeps a count of the number of blocks searched so far (sum), + * incrementing this by the length of each block run as we come + * across it. Adds sum to *count before returning (this is so + * you can search multiple arrays that are logicaly one array, + * as in the indirect region code). + * + * When/if blockno is found, if blockno is inside of a block + * run as stored on disk, we offset the start and length members + * of the block run, so that blockno is the start and len is + * still valid (the run ends in the same place). + */ static int befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t blockno, befs_block_run * run) + befs_blocknr_t blockno, befs_block_run *run) { int i; const befs_block_run *array = data->direct; befs_blocknr_t sum; - befs_blocknr_t max_block = - data->max_direct_range >> BEFS_SB(sb)->block_shift; befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); - if (blockno > max_block) { - befs_error(sb, "%s passed block outside of direct region", - __func__); - return BEFS_ERR; - } - for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS; sum += array[i].len, i++) { if (blockno >= sum && blockno < sum + (array[i].len)) { int offset = blockno - sum; + run->allocation_group = array[i].allocation_group; run->start = array[i].start + offset; run->len = array[i].len - offset; @@ -275,38 +274,39 @@ befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, } } + befs_error(sb, "%s failed to find file block %lu", __func__, + (unsigned long)blockno); befs_debug(sb, "---> %s ERROR", __func__); return BEFS_ERR; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the indirect region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - For each block in the indirect run of the datastream, read - it in and search through it for search_blk. - - XXX: - Really should check to make sure blockno is inside indirect - region. - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_indirect - find a block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the indirect region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * For each block in the indirect run of the datastream, read + * it in and search through it for search_blk. + * + * XXX: + * Really should check to make sure blockno is inside indirect + * region. + */ static int befs_find_brun_indirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run) + befs_block_run *run) { int i, j; befs_blocknr_t sum = 0; @@ -326,11 +326,12 @@ befs_find_brun_indirect(struct super_block *sb, /* Examine blocks of the indirect run one at a time */ for (i = 0; i < indirect.len; i++) { - indirblock = befs_bread(sb, indirblockno + i); + indirblock = sb_bread(sb, indirblockno + i); if (indirblock == NULL) { - befs_debug(sb, "---> %s failed to read " + befs_error(sb, "---> %s failed to read " "disk block %lu from the indirect brun", __func__, (unsigned long)indirblockno + i); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -370,52 +371,51 @@ befs_find_brun_indirect(struct super_block *sb, return BEFS_ERR; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the double-indirect region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - The block runs in the double-indirect region are different. - They are always allocated 4 fs blocks at a time, so each - block run maps a constant amount of file data. This means - that we can directly calculate how many block runs into the - double-indirect region we need to go to get to the one that - maps a particular filesystem block. - - We do this in two stages. First we calculate which of the - inode addresses in the double-indirect block will point us - to the indirect block that contains the mapping for the data, - then we calculate which of the inode addresses in that - indirect block maps the data block we are after. - - Oh, and once we've done that, we actually read in the blocks - that contain the inode addresses we calculated above. Even - though the double-indirect run may be several blocks long, - we can calculate which of those blocks will contain the index - we are after and only read that one. We then follow it to - the indirect block and perform a similar process to find - the actual block run that maps the data block we are interested - in. - - Then we offset the run as in befs_find_brun_array() and we are - done. - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_dblindirect - find a block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the double-indirect region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * The block runs in the double-indirect region are different. + * They are always allocated 4 fs blocks at a time, so each + * block run maps a constant amount of file data. This means + * that we can directly calculate how many block runs into the + * double-indirect region we need to go to get to the one that + * maps a particular filesystem block. + * + * We do this in two stages. First we calculate which of the + * inode addresses in the double-indirect block will point us + * to the indirect block that contains the mapping for the data, + * then we calculate which of the inode addresses in that + * indirect block maps the data block we are after. + * + * Oh, and once we've done that, we actually read in the blocks + * that contain the inode addresses we calculated above. Even + * though the double-indirect run may be several blocks long, + * we can calculate which of those blocks will contain the index + * we are after and only read that one. We then follow it to + * the indirect block and perform a similar process to find + * the actual block run that maps the data block we are interested + * in. + * + * Then we offset the run as in befs_find_brun_array() and we are + * done. + */ static int befs_find_brun_dblindirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run) + befs_block_run *run) { int dblindir_indx; int indir_indx; @@ -430,10 +430,9 @@ befs_find_brun_dblindirect(struct super_block *sb, struct buffer_head *indir_block; befs_block_run indir_run; befs_disk_inode_addr *iaddr_array; - struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_blocknr_t indir_start_blk = - data->max_indirect_range >> befs_sb->block_shift; + data->max_indirect_range >> BEFS_SB(sb)->block_shift; off_t dbl_indir_off = blockno - indir_start_blk; @@ -471,7 +470,7 @@ befs_find_brun_dblindirect(struct super_block *sb, } dbl_indir_block = - befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + + sb_bread(sb, iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); if (dbl_indir_block == NULL) { befs_error(sb, "%s couldn't read the " @@ -479,7 +478,6 @@ befs_find_brun_dblindirect(struct super_block *sb, (unsigned long) iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); - brelse(dbl_indir_block); return BEFS_ERR; } @@ -499,12 +497,11 @@ befs_find_brun_dblindirect(struct super_block *sb, } indir_block = - befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); + sb_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); if (indir_block == NULL) { befs_error(sb, "%s couldn't read the indirect block " "at blockno %lu", __func__, (unsigned long) iaddr2blockno(sb, &indir_run) + which_block); - brelse(indir_block); return BEFS_ERR; } diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 4de7cffcd662..85c13392e9e8 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -169,6 +169,7 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); + befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, sup->inode_size)); befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); befs_debug(sb, " blocks_per_ag %u", diff --git a/fs/befs/io.c b/fs/befs/io.c index 523c8af2d770..b4a558126ee1 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -27,7 +27,7 @@ struct buffer_head * befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) { struct buffer_head *bh; - befs_blocknr_t block = 0; + befs_blocknr_t block; struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_debug(sb, "---> Enter %s " @@ -59,27 +59,3 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } - -struct buffer_head * -befs_bread(struct super_block *sb, befs_blocknr_t block) -{ - struct buffer_head *bh; - - befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); - - bh = sb_bread(sb, block); - - if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", - (unsigned long)block); - goto error; - } - - befs_debug(sb, "<--- %s", __func__); - - return bh; - - error: - befs_debug(sb, "<--- %s ERROR", __func__); - return NULL; -} diff --git a/fs/befs/io.h b/fs/befs/io.h index 9b78266b6aa5..78d7bc6e60de 100644 --- a/fs/befs/io.h +++ b/fs/befs/io.h @@ -5,5 +5,3 @@ struct buffer_head *befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr); -struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block); - diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index bfe9f9994935..647a276eba56 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -120,7 +120,7 @@ befs_get_block(struct inode *inode, sector_t block, struct super_block *sb = inode->i_sb; befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_block_run run = BAD_IADDR; - int res = 0; + int res; ulong disk_off; befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", @@ -179,15 +179,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) kfree(utfname); } else { - ret = befs_btree_find(sb, ds, dentry->d_name.name, &offset); + ret = befs_btree_find(sb, ds, name, &offset); } if (ret == BEFS_BT_NOT_FOUND) { befs_debug(sb, "<--- %s %pd not found", __func__, dentry); + d_add(dentry, NULL); return ERR_PTR(-ENOENT); } else if (ret != BEFS_OK || offset == 0) { - befs_warning(sb, "<--- %s Error", __func__); + befs_error(sb, "<--- %s Error", __func__); return ERR_PTR(-ENODATA); } @@ -211,56 +212,55 @@ befs_readdir(struct file *file, struct dir_context *ctx) befs_off_t value; int result; size_t keysize; - unsigned char d_type; char keybuf[BEFS_NAME_LEN + 1]; befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld", __func__, file, inode->i_ino, ctx->pos); -more: - result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, - keybuf, &keysize, &value); + while (1) { + result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, + keybuf, &keysize, &value); - if (result == BEFS_ERR) { - befs_debug(sb, "<--- %s ERROR", __func__); - befs_error(sb, "IO error reading %pD (inode %lu)", - file, inode->i_ino); - return -EIO; - - } else if (result == BEFS_BT_END) { - befs_debug(sb, "<--- %s END", __func__); - return 0; - - } else if (result == BEFS_BT_EMPTY) { - befs_debug(sb, "<--- %s Empty directory", __func__); - return 0; - } + if (result == BEFS_ERR) { + befs_debug(sb, "<--- %s ERROR", __func__); + befs_error(sb, "IO error reading %pD (inode %lu)", + file, inode->i_ino); + return -EIO; - d_type = DT_UNKNOWN; + } else if (result == BEFS_BT_END) { + befs_debug(sb, "<--- %s END", __func__); + return 0; - /* Convert to NLS */ - if (BEFS_SB(sb)->nls) { - char *nlsname; - int nlsnamelen; - result = - befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); - if (result < 0) { - befs_debug(sb, "<--- %s ERROR", __func__); - return result; + } else if (result == BEFS_BT_EMPTY) { + befs_debug(sb, "<--- %s Empty directory", __func__); + return 0; } - if (!dir_emit(ctx, nlsname, nlsnamelen, - (ino_t) value, d_type)) { + + /* Convert to NLS */ + if (BEFS_SB(sb)->nls) { + char *nlsname; + int nlsnamelen; + + result = + befs_utf2nls(sb, keybuf, keysize, &nlsname, + &nlsnamelen); + if (result < 0) { + befs_debug(sb, "<--- %s ERROR", __func__); + return result; + } + if (!dir_emit(ctx, nlsname, nlsnamelen, + (ino_t) value, DT_UNKNOWN)) { + kfree(nlsname); + return 0; + } kfree(nlsname); - return 0; + } else { + if (!dir_emit(ctx, keybuf, keysize, + (ino_t) value, DT_UNKNOWN)) + return 0; } - kfree(nlsname); - } else { - if (!dir_emit(ctx, keybuf, keysize, - (ino_t) value, d_type)) - return 0; + ctx->pos++; } - ctx->pos++; - goto more; } static struct inode * @@ -299,7 +299,6 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) struct befs_sb_info *befs_sb = BEFS_SB(sb); struct befs_inode_info *befs_ino; struct inode *inode; - long ret = -EIO; befs_debug(sb, "---> %s inode = %lu", __func__, ino); @@ -318,7 +317,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_ino->i_inode_num.allocation_group, befs_ino->i_inode_num.start, befs_ino->i_inode_num.len); - bh = befs_bread(sb, inode->i_ino); + bh = sb_bread(sb, inode->i_ino); if (!bh) { befs_error(sb, "unable to read inode block - " "inode = %lu", inode->i_ino); @@ -421,7 +420,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) unacquire_none: iget_failed(inode); befs_debug(sb, "<--- %s - Bad inode", __func__); - return ERR_PTR(ret); + return ERR_PTR(-EIO); } /* Initialize the inode cache. Called at fs setup. @@ -436,10 +435,9 @@ befs_init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); - if (befs_inode_cachep == NULL) { - pr_err("%s: Couldn't initialize inode slabcache\n", __func__); + if (befs_inode_cachep == NULL) return -ENOMEM; - } + return 0; } @@ -524,8 +522,6 @@ befs_utf2nls(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); - *out_len = 0; return -ENOMEM; } @@ -604,7 +600,6 @@ befs_nls2utf(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -637,10 +632,6 @@ befs_nls2utf(struct super_block *sb, const char *in, return -EILSEQ; } -/** - * Use the - * - */ enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; @@ -760,19 +751,19 @@ befs_fill_super(struct super_block *sb, void *data, int silent) long ret = -EINVAL; const unsigned long sb_block = 0; const off_t x86_sb_off = 512; + int blocksize; save_mount_options(sb, data); sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); - if (sb->s_fs_info == NULL) { - pr_err("(%s): Unable to allocate memory for private " - "portion of superblock. Bailing.\n", sb->s_id); + if (sb->s_fs_info == NULL) goto unacquire_none; - } + befs_sb = BEFS_SB(sb); if (!parse_options((char *) data, &befs_sb->mount_opts)) { - befs_error(sb, "cannot parse mount options"); + if (!silent) + befs_error(sb, "cannot parse mount options"); goto unacquire_priv_sbp; } @@ -793,10 +784,16 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * least 1k to get the second 512 bytes of the volume. * -WD 10-26-01 */ - sb_min_blocksize(sb, 1024); + blocksize = sb_min_blocksize(sb, 1024); + if (!blocksize) { + if (!silent) + befs_error(sb, "unable to set blocksize"); + goto unacquire_priv_sbp; + } if (!(bh = sb_bread(sb, sb_block))) { - befs_error(sb, "unable to read superblock"); + if (!silent) + befs_error(sb, "unable to read superblock"); goto unacquire_priv_sbp; } @@ -820,9 +817,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if( befs_sb->num_blocks > ~((sector_t)0) ) { - befs_error(sb, "blocks count: %llu " - "is larger than the host can use", - befs_sb->num_blocks); + if (!silent) + befs_error(sb, "blocks count: %llu is larger than the host can use", + befs_sb->num_blocks); goto unacquire_priv_sbp; } @@ -841,7 +838,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); if (!sb->s_root) { - befs_error(sb, "get root inode failed"); + if (!silent) + befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } @@ -870,9 +868,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) unacquire_priv_sbp: kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); + sb->s_fs_info = NULL; unacquire_none: - sb->s_fs_info = NULL; return ret; } diff --git a/fs/befs/super.c b/fs/befs/super.c index aeafc4d84278..7c50025c99d8 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -13,24 +13,20 @@ #include "befs.h" #include "super.h" -/** - * load_befs_sb -- Read from disk and properly byteswap all the fields +/* + * befs_load_sb -- Read from disk and properly byteswap all the fields * of the befs superblock - * - * - * - * */ int -befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) +befs_load_sb(struct super_block *sb, befs_super_block *disk_sb) { struct befs_sb_info *befs_sb = BEFS_SB(sb); /* Check the byte order of the filesystem */ if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) - befs_sb->byte_order = BEFS_BYTESEX_LE; + befs_sb->byte_order = BEFS_BYTESEX_LE; else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) - befs_sb->byte_order = BEFS_BYTESEX_BE; + befs_sb->byte_order = BEFS_BYTESEX_BE; befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); @@ -45,6 +41,8 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift); befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags); + befs_sb->flags = fs32_to_cpu(sb, disk_sb->flags); + befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks); befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start); befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end); @@ -84,15 +82,15 @@ befs_check_sb(struct super_block *sb) } if (befs_sb->block_size > PAGE_SIZE) { - befs_error(sb, "blocksize(%u) cannot be larger" + befs_error(sb, "blocksize(%u) cannot be larger " "than system pagesize(%lu)", befs_sb->block_size, PAGE_SIZE); return BEFS_ERR; } /* - * block_shift and block_size encode the same information - * in different ways as a consistency check. + * block_shift and block_size encode the same information + * in different ways as a consistency check. */ if ((1 << befs_sb->block_shift) != befs_sb->block_size) { @@ -101,10 +99,18 @@ befs_check_sb(struct super_block *sb) return BEFS_ERR; } - if (befs_sb->log_start != befs_sb->log_end) { + + /* ag_shift also encodes the same information as blocks_per_ag in a + * different way, non-fatal consistency check + */ + if ((1 << befs_sb->ag_shift) != befs_sb->blocks_per_ag) + befs_error(sb, "ag_shift disagrees with blocks_per_ag."); + + if (befs_sb->log_start != befs_sb->log_end || + befs_sb->flags == BEFS_DIRTY) { befs_error(sb, "Filesystem not clean! There are blocks in the " - "journal. You must boot into BeOS and mount this volume " - "to make it clean."); + "journal. You must boot into BeOS and mount this " + "volume to make it clean."); return BEFS_ERR; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ccc70d96958d..d4d8b7e36b2f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -698,7 +698,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } @@ -728,7 +728,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6c21bad26a27..0b8ce2b9f7d0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -252,7 +252,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e720d3e6ec20..3a57f99d96aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2586,6 +2586,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + int clear_free_space_tree = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -3148,6 +3149,26 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = 1; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = 1; + } + + if (clear_free_space_tree) { + btrfs_info(fs_info, "clearing free space tree"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); + close_ctree(tree_root); + return ret; + } + } + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); @@ -3185,18 +3206,6 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); - close_ctree(tree_root); - return ret; - } - } - if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ee40384c394d..66a755150056 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5558,17 +5558,45 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -/* - * The extent buffer bitmap operations are done with byte granularity because - * bitmap items are not guaranteed to be aligned to a word and therefore a - * single word in a bitmap may straddle two pages in the extent buffer. - */ -#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) -#define BITMAP_FIRST_BYTE_MASK(start) \ - ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) -#define BITMAP_LAST_BYTE_MASK(nbits) \ - (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +void le_bitmap_set(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~(u8)0; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + *p |= mask_to_set; + } +} + +void le_bitmap_clear(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~(u8)0; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + *p &= ~mask_to_clear; + } +} /* * eb_bitmap_offset() - calculate the page and offset of the byte containing the @@ -5612,7 +5640,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, unsigned long nr) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; @@ -5634,13 +5662,13 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5651,7 +5679,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, kaddr[offset] |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0U; + mask_to_set = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; @@ -5676,13 +5704,13 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5693,7 +5721,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, kaddr[offset] &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0U; + mask_to_clear = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4a094f1dc7ef..ab31d145227e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -59,6 +59,28 @@ */ #define EXTENT_PAGE_PRIVATE 1 +/* + * The extent buffer bitmap operations are done with byte granularity instead of + * word granularity for two reasons: + * 1. The bitmaps must be little-endian on disk. + * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +static inline int le_test_bit(int nr, const u8 *addr) +{ + return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); +} + +extern void le_bitmap_set(u8 *map, unsigned int start, int len); +extern void le_bitmap_clear(u8 *map, unsigned int start, int len); + struct extent_state; struct btrfs_root; struct btrfs_io_bio; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index e4a42a8e4f84..57401b474ec6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -151,7 +151,7 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); } -static unsigned long *alloc_bitmap(u32 bitmap_size) +static u8 *alloc_bitmap(u32 bitmap_size) { void *mem; @@ -180,8 +180,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; - char *bitmap_cursor; + u8 *bitmap, *bitmap_cursor; u64 start, end; u64 bitmap_range, i; u32 bitmap_size, flags, expected_extent_count; @@ -231,7 +230,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, block_group->sectorsize); last = div_u64(found_key.objectid + found_key.offset - start, block_group->sectorsize); - bitmap_set(bitmap, first, last - first); + le_bitmap_set(bitmap, first, last - first); extent_count++; nr++; @@ -270,7 +269,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - bitmap_cursor = (char *)bitmap; + bitmap_cursor = bitmap; bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { @@ -319,7 +318,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; + u8 *bitmap; u64 start, end; /* Initialize to silence GCC. */ u64 extent_start = 0; @@ -363,7 +362,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { unsigned long ptr; - char *bitmap_cursor; + u8 *bitmap_cursor; u32 bitmap_pos, data_size; ASSERT(found_key.objectid >= start); @@ -373,7 +372,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_pos = div_u64(found_key.objectid - start, block_group->sectorsize * BITS_PER_BYTE); - bitmap_cursor = ((char *)bitmap) + bitmap_pos; + bitmap_cursor = bitmap + bitmap_pos; data_size = free_space_bitmap_size(found_key.offset, block_group->sectorsize); @@ -410,7 +409,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, offset = start; bitnr = 0; while (offset < end) { - bit = !!test_bit(bitnr, bitmap); + bit = !!le_test_bit(bitnr, bitmap); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { @@ -1185,6 +1184,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) } btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); ret = btrfs_commit_transaction(trans, tree_root); @@ -1253,6 +1253,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 01bc36cec26e..71261b459863 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5805,6 +5805,64 @@ static int changed_extent(struct send_ctx *sctx, int ret = 0; if (sctx->cur_ino != sctx->cmp_key->objectid) { + + if (result == BTRFS_COMPARE_TREE_CHANGED) { + struct extent_buffer *leaf_l; + struct extent_buffer *leaf_r; + struct btrfs_file_extent_item *ei_l; + struct btrfs_file_extent_item *ei_r; + + leaf_l = sctx->left_path->nodes[0]; + leaf_r = sctx->right_path->nodes[0]; + ei_l = btrfs_item_ptr(leaf_l, + sctx->left_path->slots[0], + struct btrfs_file_extent_item); + ei_r = btrfs_item_ptr(leaf_r, + sctx->right_path->slots[0], + struct btrfs_file_extent_item); + + /* + * We may have found an extent item that has changed + * only its disk_bytenr field and the corresponding + * inode item was not updated. This case happens due to + * very specific timings during relocation when a leaf + * that contains file extent items is COWed while + * relocation is ongoing and its in the stage where it + * updates data pointers. So when this happens we can + * safely ignore it since we know it's the same extent, + * but just at different logical and physical locations + * (when an extent is fully replaced with a new one, we + * know the generation number must have changed too, + * since snapshot creation implies committing the current + * transaction, and the inode item must have been updated + * as well). + * This replacement of the disk_bytenr happens at + * relocation.c:replace_file_extents() through + * relocation.c:btrfs_reloc_cow_block(). + */ + if (btrfs_file_extent_generation(leaf_l, ei_l) == + btrfs_file_extent_generation(leaf_r, ei_r) && + btrfs_file_extent_ram_bytes(leaf_l, ei_l) == + btrfs_file_extent_ram_bytes(leaf_r, ei_r) && + btrfs_file_extent_compression(leaf_l, ei_l) == + btrfs_file_extent_compression(leaf_r, ei_r) && + btrfs_file_extent_encryption(leaf_l, ei_l) == + btrfs_file_extent_encryption(leaf_r, ei_r) && + btrfs_file_extent_other_encoding(leaf_l, ei_l) == + btrfs_file_extent_other_encoding(leaf_r, ei_r) && + btrfs_file_extent_type(leaf_l, ei_l) == + btrfs_file_extent_type(leaf_r, ei_r) && + btrfs_file_extent_disk_bytenr(leaf_l, ei_l) != + btrfs_file_extent_disk_bytenr(leaf_r, ei_r) && + btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) == + btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) && + btrfs_file_extent_offset(leaf_l, ei_l) == + btrfs_file_extent_offset(leaf_r, ei_r) && + btrfs_file_extent_num_bytes(leaf_l, ei_l) == + btrfs_file_extent_num_bytes(leaf_r, ei_r)) + return 0; + } + inconsistent_snapshot_error(sctx, result, "extent"); return -EIO; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d19ab0317283..caad80bb9bd0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -273,20 +273,37 @@ out: return ret; } -/** - * test_bit_in_byte - Determine whether a bit is set in a byte - * @nr: bit number to test - * @addr: Address to start counting from - */ -static inline int test_bit_in_byte(int nr, const u8 *addr) +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, + unsigned long len) { - return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); + unsigned long i; + + for (i = 0; i < len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + test_msg("Bits do not match\n"); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + test_msg("Offset bits do not match\n"); + return -EINVAL; + } + } + return 0; } static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { - unsigned long i, x; + unsigned long i, j; + u32 x; + int ret; memset(bitmap, 0, len); memset_extent_buffer(eb, 0, 0, len); @@ -297,16 +314,18 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, bitmap_set(bitmap, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Setting all bits failed\n"); - return -EINVAL; + return ret; } bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Clearing all bits failed\n"); - return -EINVAL; + return ret; } /* Straddling pages test */ @@ -316,9 +335,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, sizeof(long) * BITS_PER_BYTE); extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Setting straddling pages failed\n"); - return -EINVAL; + return ret; } bitmap_set(bitmap, 0, len * BITS_PER_BYTE); @@ -328,9 +348,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + return ret; } } @@ -339,28 +360,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, * something repetitive that could miss some hypothetical off-by-n bug. */ x = 0; - for (i = 0; i < len / sizeof(long); i++) { - x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; - bitmap[i] = x; - } - write_extent_buffer(eb, bitmap, 0, len); - - for (i = 0; i < len * BITS_PER_BYTE; i++) { - int bit, bit1; - - bit = !!test_bit_in_byte(i, (u8 *)bitmap); - bit1 = !!extent_buffer_test_bit(eb, 0, i); - if (bit1 != bit) { - test_msg("Testing bit pattern failed\n"); - return -EINVAL; + bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); + for (i = 0; i < len * BITS_PER_BYTE / 32; i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; + for (j = 0; j < 32; j++) { + if (x & (1U << j)) { + bitmap_set(bitmap, i * 32 + j, 1); + extent_buffer_bitmap_set(eb, 0, i * 32 + j, 1); + } } + } - bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, - i % BITS_PER_BYTE); - if (bit1 != bit) { - test_msg("Testing bit pattern with offset failed\n"); - return -EINVAL; - } + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { + test_msg("Random bit pattern failed\n"); + return ret; } return 0; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7508d3b42780..6e144048a72e 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -24,20 +24,15 @@ #include "../transaction.h" struct free_space_extent { - u64 start, length; + u64 start; + u64 length; }; -/* - * The test cases align their operations to this in order to hit some of the - * edge cases in the bitmap code. - */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) - static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, struct btrfs_path *path, - struct free_space_extent *extents, + const struct free_space_extent * const extents, unsigned int num_extents) { struct btrfs_free_space_info *info; @@ -126,7 +121,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, struct btrfs_path *path, - struct free_space_extent *extents, + const struct free_space_extent * const extents, unsigned int num_extents) { struct btrfs_free_space_info *info; @@ -168,9 +163,10 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, static int test_empty_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { + const struct free_space_extent extents[] = { {cache->key.objectid, cache->key.offset}, }; @@ -181,9 +177,10 @@ static int test_empty_block_group(struct btrfs_trans_handle *trans, static int test_remove_all(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = {}; + const struct free_space_extent extents[] = {}; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, @@ -201,16 +198,17 @@ static int test_remove_all(struct btrfs_trans_handle *trans, static int test_remove_beginning(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid + BITMAP_RANGE, - cache->key.offset - BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid + alignment, + cache->key.offset - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -224,17 +222,18 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, static int test_remove_end(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, cache->key.offset - BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, cache->key.objectid + - cache->key.offset - BITMAP_RANGE, - BITMAP_RANGE); + cache->key.offset - alignment, + alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -247,18 +246,19 @@ static int test_remove_end(struct btrfs_trans_handle *trans, static int test_remove_middle(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, BITMAP_RANGE}, - {cache->key.objectid + 2 * BITMAP_RANGE, - cache->key.offset - 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, alignment}, + {cache->key.objectid + 2 * alignment, + cache->key.offset - 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -271,10 +271,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, static int test_merge_left(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, 2 * alignment}, }; int ret; @@ -287,15 +288,15 @@ static int test_merge_left(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -308,10 +309,11 @@ static int test_merge_left(struct btrfs_trans_handle *trans, static int test_merge_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid + alignment, 2 * alignment}, }; int ret; @@ -324,16 +326,16 @@ static int test_merge_right(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -346,10 +348,11 @@ static int test_merge_right(struct btrfs_trans_handle *trans, static int test_merge_both(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, 3 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, 3 * alignment}, }; int ret; @@ -362,23 +365,23 @@ static int test_merge_both(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -391,12 +394,13 @@ static int test_merge_both(struct btrfs_trans_handle *trans, static int test_merge_none(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, BITMAP_RANGE}, - {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE}, - {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, alignment}, + {cache->key.objectid + 2 * alignment, alignment}, + {cache->key.objectid + 4 * alignment, alignment}, }; int ret; @@ -409,23 +413,23 @@ static int test_merge_none(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 4 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 4 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -438,10 +442,11 @@ static int test_merge_none(struct btrfs_trans_handle *trans, typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_fs_info *, struct btrfs_block_group_cache *, - struct btrfs_path *); + struct btrfs_path *, + u32 alignment); -static int run_test(test_func_t test_func, int bitmaps, - u32 sectorsize, u32 nodesize) +static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, + u32 nodesize, u32 alignment) { struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; @@ -480,7 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps, btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); + cache = btrfs_alloc_dummy_block_group(8 * alignment, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -514,7 +519,7 @@ static int run_test(test_func_t test_func, int bitmaps, } } - ret = test_func(&trans, root->fs_info, cache, path); + ret = test_func(&trans, root->fs_info, cache, path, alignment); if (ret) goto out; @@ -539,15 +544,27 @@ out: return ret; } -static int run_test_both_formats(test_func_t test_func, - u32 sectorsize, u32 nodesize) +static int run_test_both_formats(test_func_t test_func, u32 sectorsize, + u32 nodesize, u32 alignment) { + int test_ret = 0; int ret; - ret = run_test(test_func, 0, sectorsize, nodesize); - if (ret) - return ret; - return run_test(test_func, 1, sectorsize, nodesize); + ret = run_test(test_func, 0, sectorsize, nodesize, alignment); + if (ret) { + test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + } + + ret = run_test(test_func, 1, sectorsize, nodesize, alignment); + if (ret) { + test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + } + + return test_ret; } int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) @@ -563,18 +580,30 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) test_merge_both, test_merge_none, }; + u32 bitmap_alignment; + int test_ret = 0; int i; + /* + * Align some operations to a page to flush out bugs in the extent + * buffer bitmap handling of highmem. + */ + bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE; + test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i], sectorsize, - nodesize); - if (ret) { - test_msg("%pf : sectorsize %u failed\n", - tests[i], sectorsize); - return ret; - } + int ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + sectorsize); + if (ret) + test_ret = ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + bitmap_alignment); + if (ret) + test_ret = ret; } - return 0; + return test_ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 528cae123dc9..3d33c4e41e5f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2713,14 +2713,12 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, int index, int error) { struct btrfs_log_ctx *ctx; + struct btrfs_log_ctx *safe; - if (!error) { - INIT_LIST_HEAD(&root->log_ctxs[index]); - return; - } - - list_for_each_entry(ctx, &root->log_ctxs[index], list) + list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { + list_del_init(&ctx->list); ctx->log_ret = error; + } INIT_LIST_HEAD(&root->log_ctxs[index]); } @@ -2961,13 +2959,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: - /* - * We needn't get log_mutex here because we are sure all - * the other tasks are blocked. - */ + mutex_lock(&log_root_tree->log_mutex); btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); - mutex_lock(&log_root_tree->log_mutex); log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); @@ -2978,10 +2972,8 @@ out_wake_log_root: if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: - /* See above. */ - btrfs_remove_all_log_ctxs(root, index1, ret); - mutex_lock(&root->log_mutex); + btrfs_remove_all_log_ctxs(root, index1, ret); root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7bf08825cc11..18630e800208 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1272,7 +1272,8 @@ again: statret = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, !!page); if (statret < 0) { - __free_page(page); + if (page) + __free_page(page); if (statret == -ENODATA) { BUG_ON(retry_op != READ_INLINE); goto again; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bca1b49c1c4b..ef4d04647325 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1511,7 +1511,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && + !(rinfo->hash_order && req->r_path2)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a29ffce98187..b382e5910eea 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -845,6 +845,8 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) err = ceph_fs_debugfs_init(fsc); if (err < 0) goto fail; + } else { + root = dget(fsc->sb->s_root); } fsc->mount_state = CEPH_MOUNT_MOUNTED; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 40b703217977..febc28f9e2c2 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -16,7 +16,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr); -const struct xattr_handler ceph_other_xattr_handler; +static const struct xattr_handler ceph_other_xattr_handler; /* * List of handlers for synthetic system.* attributes. Other @@ -1086,7 +1086,7 @@ static int ceph_set_xattr_handler(const struct xattr_handler *handler, return __ceph_setxattr(inode, name, value, size, flags); } -const struct xattr_handler ceph_other_xattr_handler = { +static const struct xattr_handler ceph_other_xattr_handler = { .prefix = "", /* match any name => handlers called with full name */ .get = ceph_get_xattr_handler, .set = ceph_set_xattr_handler, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6c58e13fed2f..3d03e48a9213 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -152,6 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + seq_printf(m, "\nNumber of credits: %d", server->credits); i++; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 1418daa03d95..07ed81cf1552 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -49,6 +49,7 @@ #define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible * root mountable */ +#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 0065256881d8..57ff0756e30c 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -36,7 +36,15 @@ struct smb_mnt_fs_info { __u64 cifs_posix_caps; } __packed; +struct smb_snapshot_array { + __u32 number_of_snapshots; + __u32 number_of_snapshots_returned; + __u32 snapshot_array_size; + /* snapshots[]; */ +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) +#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 71e8a56e9479..15bac390dff9 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -42,6 +42,35 @@ static const struct cifs_sid sid_authusers = { /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; +/* S-1-22-1 Unmapped Unix users */ +static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-22-2 Unmapped Unix groups */ +static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* + * See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx + */ + +/* S-1-5-88 MS NFS and Apple style UID/GID/mode */ + +/* S-1-5-88-1 Unix uid */ +static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-2 Unix gid */ +static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-3 Unix mode */ +static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + static const struct cred *root_cred; static int @@ -183,6 +212,62 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) return 0; /* sids compare/match */ } +static bool +is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) +{ + int i; + int num_subauth; + const struct cifs_sid *pwell_known_sid; + + if (!psid || (puid == NULL)) + return false; + + num_subauth = psid->num_subauth; + + /* check if Mac (or Windows NFS) vs. Samba format for Unix owner SID */ + if (num_subauth == 2) { + if (is_group) + pwell_known_sid = &sid_unix_groups; + else + pwell_known_sid = &sid_unix_users; + } else if (num_subauth == 3) { + if (is_group) + pwell_known_sid = &sid_unix_NFS_groups; + else + pwell_known_sid = &sid_unix_NFS_users; + } else + return false; + + /* compare the revision */ + if (psid->revision != pwell_known_sid->revision) + return false; + + /* compare all of the six auth values */ + for (i = 0; i < NUM_AUTHS; ++i) { + if (psid->authority[i] != pwell_known_sid->authority[i]) { + cifs_dbg(FYI, "auth %d did not match\n", i); + return false; + } + } + + if (num_subauth == 2) { + if (psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) + return false; + + *puid = le32_to_cpu(psid->sub_auth[1]); + } else /* 3 subauths, ie Windows/Mac style */ { + *puid = le32_to_cpu(psid->sub_auth[0]); + if ((psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) || + (psid->sub_auth[1] != pwell_known_sid->sub_auth[1])) + return false; + + *puid = le32_to_cpu(psid->sub_auth[2]); + } + + cifs_dbg(FYI, "Unix UID %d returned from SID\n", *puid); + return true; /* well known sid found, uid returned */ +} + static void cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { @@ -276,6 +361,43 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, return -EIO; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) { + uint32_t unix_id; + bool is_group; + + if (sidtype != SIDOWNER) + is_group = true; + else + is_group = false; + + if (is_well_known_sid(psid, &unix_id, is_group) == false) + goto try_upcall_to_get_id; + + if (is_group) { + kgid_t gid; + gid_t id; + + id = (gid_t)unix_id; + gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) { + fgid = gid; + goto got_valid_id; + } + } else { + kuid_t uid; + uid_t id; + + id = (uid_t)unix_id; + uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) { + fuid = uid; + goto got_valid_id; + } + } + /* If unable to find uid/gid easily from SID try via upcall */ + } + +try_upcall_to_get_id: sidstr = sid_to_key_str(psid, sidtype); if (!sidstr) return -ENOMEM; @@ -329,6 +451,7 @@ out_revert_creds: * Note that we return 0 here unconditionally. If the mapping * fails then we just fall back to using the mnt_uid/mnt_gid. */ +got_valid_id: if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cca04e710421..15261ba464c5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -64,15 +64,15 @@ unsigned int global_secflags = CIFSSEC_DEF; unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; -module_param(CIFSMaxBufSize, uint, 0); +module_param(CIFSMaxBufSize, uint, 0444); MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; -module_param(cifs_min_rcv, uint, 0); +module_param(cifs_min_rcv, uint, 0444); MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " "1 to 64"); unsigned int cifs_min_small = 30; -module_param(cifs_min_small, uint, 0); +module_param(cifs_min_small, uint, 0444); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; @@ -271,7 +271,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->createtime = 0; cifs_inode->epoch = 0; #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(cifs_inode->lease_key); #endif /* * Can not set i_flags here - they get immediately overwritten to zero @@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) seq_puts(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) + seq_puts(s, ",idsfromsid"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) seq_puts(s, ",serverino"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -1262,7 +1264,6 @@ init_cifs(void) GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; spin_lock_init(&cifs_tcp_ses_lock); - spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret)); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8f1d8c1e72be..1f17f6bd7a60 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -75,6 +75,18 @@ #define SMB_ECHO_INTERVAL_MAX 600 #define SMB_ECHO_INTERVAL_DEFAULT 60 +/* + * Default number of credits to keep available for SMB3. + * This value is chosen somewhat arbitrarily. The Windows client + * defaults to 128 credits, the Windows server allows clients up to + * 512 credits (or 8K for later versions), and the NetApp server + * does not limit clients at all. Choose a high enough default value + * such that the client shouldn't limit performance, but allow mount + * to override (until you approach 64K, where we limit credits to 65000 + * to reduce possibility of seeing more server credit overflow bugs. + */ +#define SMB2_MAX_CREDITS_AVAILABLE 32000 + #include "cifspdu.h" #ifndef XATTR_DOS_ATTRIB @@ -376,6 +388,8 @@ struct smb_version_operations { int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, struct cifsFileInfo *src_file); + int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file, void __user *); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -464,6 +478,7 @@ struct smb_vol { bool retry:1; bool intr:1; bool setuids:1; + bool setuidfromacl:1; bool override_uid:1; bool override_gid:1; bool dynperm:1; @@ -510,6 +525,7 @@ struct smb_vol { struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ + unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -567,7 +583,8 @@ struct TCP_Server_Info { bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ bool tcp_nodelay; - int credits; /* send no more requests at once */ + unsigned int credits; /* send no more requests at once */ + unsigned int max_credits; /* can override large 32000 default at mnt */ unsigned int in_flight; /* number of requests on the wire to server */ spinlock_t req_lock; /* protect the two values above */ struct mutex srv_mutex; @@ -833,6 +850,7 @@ struct cifs_tcon { struct list_head tcon_list; int tc_count; struct list_head openFileList; + spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; @@ -889,7 +907,7 @@ struct cifs_tcon { #endif /* CONFIG_CIFS_STATS2 */ __u64 bytes_read; __u64 bytes_written; - spinlock_t stat_lock; + spinlock_t stat_lock; /* protects the two fields above */ #endif /* CONFIG_CIFS_STATS */ FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ @@ -1040,20 +1058,24 @@ struct cifs_fid_locks { }; struct cifsFileInfo { + /* following two lists are protected by tcon->open_file_lock */ struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ + /* lock list below protected by cifsi->lock_sem */ struct cifs_fid_locks *llist; /* brlocks held by this fid */ kuid_t uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ + struct list_head rlist; /* reconnect list */ /* BB add lock scope info here if needed */ ; /* lock scope id (0 if none) */ struct dentry *dentry; - unsigned int f_flags; struct tcon_link *tlink; + unsigned int f_flags; bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; - int count; /* refcount protected by cifs_file_list_lock */ + int count; + spinlock_t file_info_lock; /* protects four flag/count fields above */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ @@ -1120,7 +1142,7 @@ struct cifs_writedata { /* * Take a reference on the file private data. Must be called with - * cifs_file_list_lock held. + * cfile->file_info_lock held. */ static inline void cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) @@ -1514,8 +1536,10 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers - * cifs_file_list_lock protects: - * list operations on tcp and SMB session lists and tCon lists + * tcp_ses_lock protects: + * list operations on tcp and SMB session lists + * tcon->open_file_lock protects the list of open files hanging off the tcon + * cfile->file_info_lock protects counters and fields in cifs file struct * f_owner.lock protects certain per file struct operations * mapping->page_lock protects certain per page operations * @@ -1547,18 +1571,12 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; * tcp session, and the list of tcon's per smb session. It also protects * the reference counters for the server, smb session, and tcon. Finally, * changes to the tcon->tidStatus should be done while holding this lock. + * generally the locks should be taken in order tcp_ses_lock before + * tcon->open_file_lock and that before file->file_info_lock since the + * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -/* - * This lock protects the cifs_file->llist and cifs_file->flist - * list operations, and updates to some flags (cifs_file->invalidHandle) - * It will be moved to either use the tcon->stat_lock or equivalent later. - * If cifs_tcp_ses_lock and the lock below are both needed to be held, then - * the cifs_tcp_ses_lock must be grabbed first and released last. - */ -GLOBAL_EXTERN spinlock_t cifs_file_list_lock; - #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4ead72a001f9..ced0e42ce460 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -193,6 +193,8 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data, extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); +extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon); + extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f82d2823622f..3f3185febc58 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -98,13 +98,13 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) struct list_head *tmp1; /* list all files open on tree connection and mark them invalid */ - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_safe(tmp, tmp1, &tcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); open_file->invalidHandle = true; open_file->oplock_break_cancelled = true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2e4f4bad8b1e..aab5227979e2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -63,7 +63,6 @@ extern mempool_t *cifs_req_poolp; #define TLINK_IDLE_EXPIRE (600 * HZ) enum { - /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, Opt_forceuid, Opt_noforceuid, @@ -76,7 +75,7 @@ enum { Opt_noposixpaths, Opt_nounix, Opt_nocase, Opt_brl, Opt_nobrl, - Opt_forcemandatorylock, Opt_setuids, + Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids, Opt_nosetuids, Opt_dynperm, Opt_nodynperm, Opt_nohard, Opt_nosoft, Opt_nointr, Opt_intr, @@ -95,7 +94,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, - Opt_echo_interval, + Opt_echo_interval, Opt_max_credits, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -148,6 +147,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_forcemandatorylock, "forcemand" }, { Opt_setuids, "setuids" }, { Opt_nosetuids, "nosetuids" }, + { Opt_setuidfromacl, "idsfromsid" }, { Opt_dynperm, "dynperm" }, { Opt_nodynperm, "nodynperm" }, { Opt_nohard, "nohard" }, @@ -190,6 +190,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, { Opt_echo_interval, "echo_interval=%s" }, + { Opt_max_credits, "max_credits=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -1376,6 +1377,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_nosetuids: vol->setuids = 0; break; + case Opt_setuidfromacl: + vol->setuidfromacl = 1; + break; case Opt_dynperm: vol->dynperm = true; break; @@ -1586,6 +1590,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->echo_interval = option; break; + case Opt_max_credits: + if (get_option_ul(args, &option) || (option < 20) || + (option > 60000)) { + cifs_dbg(VFS, "%s: Invalid max_credits value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->max_credits = option; + break; /* String Arguments */ @@ -2163,7 +2176,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); + generate_random_uuid(tcp_ses->client_guid); #endif /* * at this point we are the only ones with the pointer @@ -3270,6 +3283,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + if (pvolume_info->setuidfromacl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL; if (pvolume_info->server_ino) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; if (pvolume_info->remap) @@ -3598,7 +3613,11 @@ try_mount_again: bdi_destroy(&cifs_sb->bdi); goto out; } - + if ((volume_info->max_credits < 20) || + (volume_info->max_credits > 60000)) + server->max_credits = SMB2_MAX_CREDITS_AVAILABLE; + else + server->max_credits = volume_info->max_credits; /* get a reference to a SMB session */ ses = cifs_get_smb_ses(server, volume_info); if (IS_ERR(ses)) { @@ -3688,14 +3707,16 @@ remote_path_check: goto mount_fail_check; } - rc = cifs_are_all_path_components_accessible(server, + if (rc != -EREMOTE) { + rc = cifs_are_all_path_components_accessible(server, xid, tcon, cifs_sb, full_path); - if (rc != 0) { - cifs_dbg(VFS, "cannot query dirs between root and final path, " - "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; - rc = 0; + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } } kfree(full_path); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a95fe8b1afe9..7f5f6176c6f1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -305,6 +305,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + spin_lock_init(&cfile->file_info_lock); cifs_sb_active(inode->i_sb); @@ -317,7 +318,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = 0; } - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); @@ -326,12 +327,13 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); + /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) list_add(&cfile->flist, &cinode->openFileList); else list_add_tail(&cfile->flist, &cinode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (fid->purge_cache) cifs_zap_mapping(inode); @@ -343,16 +345,16 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo * cifsFileInfo_get(struct cifsFileInfo *cifs_file) { - spin_lock(&cifs_file_list_lock); + spin_lock(&cifs_file->file_info_lock); cifsFileInfo_get_locked(cifs_file); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); return cifs_file; } /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding - * cifs_file_list_lock. + * tcon->open_file_lock and cifs_file->file_info_lock. */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { @@ -367,11 +369,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_pending_open open; bool oplock_break_cancelled; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); + + spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&tcon->open_file_lock); return; } + spin_unlock(&cifs_file->file_info_lock); if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); @@ -395,7 +401,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } - spin_unlock(&cifs_file_list_lock); + + spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); @@ -732,6 +739,15 @@ reopen_success: * to the server to get the new inode info. */ + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. + */ + if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) { + cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); + oplock = 0; + } + server->ops->set_fid(cfile, &cfile->fid, oplock); if (oparms.reconnect) cifs_relock_file(cfile); @@ -753,6 +769,36 @@ int cifs_close(struct inode *inode, struct file *file) return 0; } +void +cifs_reopen_persistent_handles(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *open_file; + struct list_head *tmp; + struct list_head *tmp1; + struct list_head tmp_list; + + cifs_dbg(FYI, "Reopen persistent handles"); + INIT_LIST_HEAD(&tmp_list); + + /* list all files open on tree connection, reopen resilient handles */ + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + if (!open_file->invalidHandle) + continue; + cifsFileInfo_get(open_file); + list_add_tail(&open_file->rlist, &tmp_list); + } + spin_unlock(&tcon->open_file_lock); + + list_for_each_safe(tmp, tmp1, &tmp_list) { + open_file = list_entry(tmp, struct cifsFileInfo, rlist); + cifs_reopen_file(open_file, false /* do not flush */); + list_del_init(&open_file->rlist); + cifsFileInfo_put(open_file); + } +} + int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; @@ -772,10 +818,10 @@ int cifs_closedir(struct inode *inode, struct file *file) server = tcon->ses->server; cifs_dbg(FYI, "Freeing private data in close dir\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) rc = server->ops->close_dir(xid, tcon, &cfile->fid); else @@ -784,7 +830,7 @@ int cifs_closedir(struct inode *inode, struct file *file) /* not much we can do if it fails anyway, ignore rc */ rc = 0; } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); buf = cfile->srch_inf.ntwrk_buf_start; if (buf) { @@ -1728,12 +1774,13 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1744,8 +1791,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1753,7 +1800,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } @@ -1762,6 +1809,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; bool any_available = false; int rc; unsigned int refind = 0; @@ -1777,15 +1825,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1796,8 +1845,8 @@ refind_writable: if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } else { if (!inv_file) @@ -1813,24 +1862,24 @@ refind_writable: if (inv_file) { any_available = false; - cifsFileInfo_get_locked(inv_file); + cifsFileInfo_get(inv_file); } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); if (!rc) return inv_file; else { - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_move_tail(&inv_file->flist, &cifs_inode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); cifsFileInfo_put(inv_file); - spin_lock(&cifs_file_list_lock); ++refind; inv_file = NULL; + spin_lock(&tcon->open_file_lock); goto refind_writable; } } @@ -3612,15 +3661,17 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; + struct cifs_tcon *tcon = + cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 1; } } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 0; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 7a3b84e300f8..9f51b81119f2 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); - + cifs_dbg(VFS, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: if (pSMBFile == NULL) @@ -267,11 +267,23 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; + case CIFS_ENUMERATE_SNAPSHOTS: + if (arg == 0) { + rc = -EINVAL; + goto cifs_ioc_exit; + } + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->enum_snapshots) + rc = tcon->ses->server->ops->enum_snapshots(xid, tcon, + pSMBFile, (void __user *)arg); + else + rc = -EOPNOTSUPP; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; } - +cifs_ioc_exit: free_xid(xid); return rc; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 813fe13c2ae1..c6729156f9a0 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -120,6 +120,7 @@ tconInfoAlloc(void) ++ret_buf->tc_count; INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + spin_lock_init(&ret_buf->open_file_lock); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -465,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { netfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -495,11 +496,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) &netfile->oplock_break); netfile->oplock_break_cancelled = false; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; @@ -613,9 +614,9 @@ backup_cred(struct cifs_sb_info *cifs_sb) void cifs_del_pending_open(struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(open->tlink)->open_file_lock); list_del(&open->olist); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } void @@ -635,7 +636,7 @@ void cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(tlink)->open_file_lock); cifs_add_pending_open_locked(fid, tlink, open); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 65cf85dcda09..8f6a2a5863b9 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -597,14 +597,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cifs_dbg(FYI, "search backing up - close and restart search\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) server->ops->close_dir(xid, tcon, &cfile->fid); } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (cfile->srch_inf.ntwrk_buf_start) { cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); if (cfile->srch_inf.smallBuf) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 4f0231e685a9..1238cd3552f9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -266,9 +266,15 @@ smb2_set_file_info(struct inode *inode, const char *full_path, struct tcon_link *tlink; int rc; + if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && + (buf->LastWriteTime == 0) && (buf->ChangeTime) && + (buf->Attributes == 0)) + return 0; /* would be a no op, no sense sending this */ + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); + rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, SMB2_OP_SET_INFO); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 389fb9f8c84e..3d383489b9cf 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -549,19 +549,19 @@ smb2_is_valid_lease_break(char *buffer) list_for_each(tmp1, &server->smb_ses_list) { ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); - spin_lock(&cifs_file_list_lock); list_for_each(tmp2, &ses->tcon_list) { tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); if (smb2_tcon_has_lease(tcon, rsp, lw)) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } + spin_unlock(&tcon->open_file_lock); } - spin_unlock(&cifs_file_list_lock); } } spin_unlock(&cifs_tcp_ses_lock); @@ -603,7 +603,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { cfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -615,7 +615,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(d_inode(cfile->dentry)); - + spin_lock(&cfile->file_info_lock); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) cfile->oplock_break_cancelled = true; @@ -637,14 +637,14 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) clear_bit( CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - + spin_unlock(&cfile->file_info_lock); queue_work(cifsiod_wq, &cfile->oplock_break); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d203c0329626..5d456ebb3813 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -28,6 +28,7 @@ #include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" +#include "cifs_ioctl.h" static int change_conf(struct TCP_Server_Info *server) @@ -70,6 +71,10 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); *val += add; + if (*val > 65000) { + *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ + printk_once(KERN_WARNING "server overflowed SMB3 credits\n"); + } server->in_flight--; if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) rc = change_conf(server); @@ -287,7 +292,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) cifs_dbg(FYI, "Link Speed %lld\n", le64_to_cpu(out_buf->LinkSpeed)); } - + kfree(out_buf); return rc; } #endif /* STATS2 */ @@ -541,6 +546,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) server->ops->set_oplock_level(cinode, oplock, fid->epoch, &fid->purge_cache); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); + memcpy(cfile->fid.create_guid, fid->create_guid, 16); } static void @@ -699,6 +705,7 @@ smb2_clone_range(const unsigned int xid, cchunk_out: kfree(pcchunk); + kfree(retbuf); return rc; } @@ -823,7 +830,6 @@ smb2_duplicate_extents(const unsigned int xid, { int rc; unsigned int ret_data_len; - char *retbuf = NULL; struct duplicate_extents_to_file dup_ext_buf; struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); @@ -849,7 +855,7 @@ smb2_duplicate_extents(const unsigned int xid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), - (char **)&retbuf, + NULL, &ret_data_len); if (ret_data_len > 0) @@ -872,7 +878,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) { struct fsctl_set_integrity_information_req integr_info; - char *retbuf = NULL; unsigned int ret_data_len; integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); @@ -884,9 +889,53 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, FSCTL_SET_INTEGRITY_INFORMATION, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), + NULL, + &ret_data_len); + +} + +static int +smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, void __user *ioc_buf) +{ + char *retbuf = NULL; + unsigned int ret_data_len = 0; + int rc; + struct smb_snapshot_array snapshot_in; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SRV_ENUMERATE_SNAPSHOTS, + true /* is_fsctl */, NULL, 0 /* no input data */, (char **)&retbuf, &ret_data_len); + cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", + rc, ret_data_len); + if (rc) + return rc; + if (ret_data_len && (ioc_buf != NULL) && (retbuf != NULL)) { + /* Fixup buffer */ + if (copy_from_user(&snapshot_in, ioc_buf, + sizeof(struct smb_snapshot_array))) { + rc = -EFAULT; + kfree(retbuf); + return rc; + } + if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { + rc = -ERANGE; + return rc; + } + + if (ret_data_len > snapshot_in.snapshot_array_size) + ret_data_len = snapshot_in.snapshot_array_size; + + if (copy_to_user(ioc_buf, retbuf, ret_data_len)) + rc = -EFAULT; + } + + kfree(retbuf); + return rc; } static int @@ -1041,7 +1090,7 @@ smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid) static void smb2_new_lease_key(struct cifs_fid *fid) { - get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(fid->lease_key); } #define SMB2_SYMLINK_STRUCT_SIZE \ @@ -1654,6 +1703,7 @@ struct smb_version_operations smb21_operations = { .clone_range = smb2_clone_range, .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, + .enum_snapshots = smb3_enum_snapshots, }; struct smb_version_operations smb30_operations = { @@ -1740,6 +1790,7 @@ struct smb_version_operations smb30_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #ifdef CONFIG_CIFS_SMB311 @@ -1827,6 +1878,7 @@ struct smb_version_operations smb311_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 29e06db5f187..5ca5ea4668a1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -100,7 +100,21 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; - hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ + if (tcon && tcon->ses && tcon->ses->server) { + struct TCP_Server_Info *server = tcon->ses->server; + + spin_lock(&server->req_lock); + /* Request up to 2 credits but don't go over the limit. */ + if (server->credits >= server->max_credits) + hdr->CreditRequest = cpu_to_le16(0); + else + hdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 2)); + spin_unlock(&server->req_lock); + } else { + hdr->CreditRequest = cpu_to_le16(2); + } hdr->ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) @@ -236,8 +250,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } cifs_mark_open_files_invalid(tcon); + rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); + + if (tcon->use_persistent) + cifs_reopen_persistent_handles(tcon); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; @@ -574,59 +593,42 @@ vneg_out: return -EIO; } -int -SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct SMB2_sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct SMB2_sess_data *); + int result; + u64 previous_session; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[2]; +}; + +static int +SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) { + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; - struct smb2_sess_setup_rsp *rsp = NULL; - struct kvec iov[2]; - int rc = 0; - int resp_buftype = CIFS_NO_BUFFER; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; - u16 blob_length = 0; - struct key *spnego_key = NULL; - char *security_blob = NULL; - unsigned char *ntlmssp_blob = NULL; - bool use_spnego = false; /* else use raw ntlmssp */ - - cifs_dbg(FYI, "Session Setup\n"); - - if (!server) { - WARN(1, "%s: server is NULL!\n", __func__); - return -EIO; - } - - /* - * If we are here due to reconnect, free per-smb session key - * in case signing was required. - */ - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - - /* - * If memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = true; - - /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ - if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) - ses->sectype = RawNTLMSSP; - -ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req); if (rc) return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ + + /* if reconnect, we need to send previous sess id, otherwise it is 0 */ + req->PreviousSessionId = sess_data->previous_session; + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); @@ -642,199 +644,368 @@ ssetup_ntlmssp_authenticate: req->Capabilities = 0; req->Channel = 0; /* MBZ */ - iov[0].iov_base = (char *)req; + sess_data->iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field and 1 for pad */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; - if (ses->sectype == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg; + return 0; +} - spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } +static void +SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) +{ + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; +} - msg = spnego_key->payload.data[0]; - /* - * check version field to make sure that cifs.upcall is - * sending us a response in an expected form - */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, - "bad cifs.upcall version. Expected %d got %d", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - blob_length = msg->secblob_len; - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = blob_length; -#else - rc = -EOPNOTSUPP; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */ - ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), - GFP_KERNEL); - if (ntlmssp_blob == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - sizeof(struct _NEGOTIATE_MESSAGE), - ntlmssp_blob); */ - /* BB eventually need to add this */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - blob_length = sizeof(struct _NEGOTIATE_MESSAGE); - /* with raw NTLMSSP we don't encapsulate in SPNEGO */ - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else if (phase == NtLmAuthenticate) { - req->hdr.SessionId = ses->Suid; - rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, - nls_cp); - if (rc) { - cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", - rc); - goto ssetup_exit; /* BB double check error handling */ - } - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - blob_length, - ntlmssp_blob); */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else { - cifs_dbg(VFS, "illegal ntlmssp phase\n"); - rc = -EIO; - goto ssetup_exit; - } +static int +SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) +{ + int rc; + struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - - 1 /* pad */ - 4 /* rfc1001 len */); - req->SecurityBufferLength = cpu_to_le16(blob_length); + cpu_to_le16(sizeof(struct smb2_sess_setup_req) - + 1 /* pad */ - 4 /* rfc1001 len */); + req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - inc_rfc1001_len(req, blob_length - 1 /* pad */); + inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */); /* BB add code to build os and lm fields */ - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, - CIFS_LOG_ERROR | CIFS_NEG_OP); + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 2, + &sess_data->buf0_type, + CIFS_LOG_ERROR | CIFS_NEG_OP); - kfree(security_blob); - rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; - ses->Suid = rsp->hdr.SessionId; - if (resp_buftype != CIFS_NO_BUFFER && - rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; - } - if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != - le16_to_cpu(rsp->SecurityBufferOffset)) { - cifs_dbg(VFS, "Invalid security buffer offset %d\n", - le16_to_cpu(rsp->SecurityBufferOffset)); - rc = -EIO; - goto ssetup_exit; + return rc; +} + +static int +SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) +{ + int rc = 0; + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (ses->server->sign && ses->server->ops->generate_signingkey) { + rc = ses->server->ops->generate_signingkey(ses); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + if (rc) { + cifs_dbg(FYI, + "SMB3 session key generation failed\n"); + mutex_unlock(&ses->server->srv_mutex); + goto keygen_exit; } + } + if (!ses->server->session_estab) { + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "SMB2/3 session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROCESSING is not an error here but expected */ - rc = decode_ntlmssp_challenge(rsp->Buffer, - le16_to_cpu(rsp->SecurityBufferLength), ses); +keygen_exit: + if (!ses->server->sign) { + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + } + return rc; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct cifs_spnego_msg *msg; + struct key *spnego_key = NULL; + struct smb2_sess_setup_rsp *rsp = NULL; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; } + msg = spnego_key->payload.data[0]; /* - * BB eventually add code for SPNEGO decoding of NtlmChallenge blob, - * but at least the raw NTLMSSP case works. + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "bad cifs.upcall version. Expected %d got %d", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + ses->Suid = rsp->hdr.SessionId; + + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + SMB2_sess_free_buffer(sess_data); +} +#else +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} +#endif + +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data); + +static void +SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_rsp *rsp = NULL; + char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + /* - * No tcon so can't do - * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); + * If memory allocation is successful, caller of this function + * frees it. */ - if (rc != 0) - goto ssetup_exit; + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out_err; + } + ses->ntlmssp->sesskey_per_smbsess = true; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out_err; + + ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), + GFP_KERNEL); + if (ntlmssp_blob == NULL) { + rc = -ENOMEM; + goto out; + } + + build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; + } else { + blob_length = sizeof(struct _NEGOTIATE_MESSAGE); + /* with raw NTLMSSP we don't encapsulate in SPNEGO */ + } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rc = 0; + + if (rc) + goto out; + + if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != + le16_to_cpu(rsp->SecurityBufferOffset)) { + cifs_dbg(VFS, "Invalid security buffer offset %d\n", + le16_to_cpu(rsp->SecurityBufferOffset)); + rc = -EIO; + goto out; + } + rc = decode_ntlmssp_challenge(rsp->Buffer, + le16_to_cpu(rsp->SecurityBufferLength), ses); + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + ses->Suid = rsp->hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); -ssetup_exit: - free_rsp_buf(resp_buftype, rsp); - - /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); if (!rc) { - mutex_lock(&server->srv_mutex); - if (server->sign && server->ops->generate_signingkey) { - rc = server->ops->generate_signingkey(ses); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - if (rc) { - cifs_dbg(FYI, - "SMB3 session key generation failed\n"); - mutex_unlock(&server->srv_mutex); - goto keygen_exit; - } - } - if (!server->session_estab) { - server->sequence_number = 0x2; - server->session_estab = true; - } - mutex_unlock(&server->srv_mutex); - - cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + sess_data->result = 0; + sess_data->func = SMB2_sess_auth_rawntlmssp_authenticate; + return; } +out_err: + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} -keygen_exit: - if (!server->sign) { - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_req *req; + struct smb2_sess_setup_rsp *rsp = NULL; + unsigned char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; + req->hdr.SessionId = ses->Suid; + + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, + sess_data->nls_cp); + if (rc) { + cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc); + goto out; } - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + ses->Suid = rsp->hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} +static int +SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) +{ + if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) + ses->sectype = RawNTLMSSP; + + switch (ses->sectype) { + case Kerberos: + sess_data->func = SMB2_auth_kerberos; + break; + case RawNTLMSSP: + sess_data->func = SMB2_sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", ses->sectype); + return -EOPNOTSUPP; + } + + return 0; +} + +int +SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + struct SMB2_sess_data *sess_data; + + cifs_dbg(FYI, "Session Setup\n"); + + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; + } + + sess_data = kzalloc(sizeof(struct SMB2_sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = SMB2_select_sec(ses, sess_data); + if (rc) + goto out; + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + rc = sess_data->result; +out: + kfree(sess_data); return rc; } @@ -1164,7 +1335,7 @@ create_durable_v2_buf(struct cifs_fid *pfid) buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - get_random_bytes(buf->dcontext.CreateGuid, 16); + generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ @@ -2057,6 +2228,7 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); + buf->CreditRequest = buf->CreditCharge; spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(buf->CreditCharge); @@ -2243,6 +2415,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); + req->hdr.CreditRequest = req->hdr.CreditCharge; spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(req->hdr.CreditCharge); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ff88d9feb01e..fd3709e8de33 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -276,7 +276,7 @@ struct smb2_sess_setup_req { __le32 Channel; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le64 PreviousSessionId; + __u64 PreviousSessionId; __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5e23f64c0804..20af5187ba63 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -33,7 +33,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" - +#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ +#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ /* BB need to add server (Samba e.g) support for security and trusted prefix */ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; @@ -144,6 +145,54 @@ out: return rc; } +static int cifs_attrib_get(struct dentry *dentry, + struct inode *inode, void *value, + size_t size) +{ + ssize_t rc; + __u32 *pattribute; + + rc = cifs_revalidate_dentry_attr(dentry); + + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u32); + else if (size < sizeof(__u32)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pattribute = (__u32 *)value; + *pattribute = CIFS_I(inode)->cifsAttrs; + + return sizeof(__u32); +} + +static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, + void *value, size_t size) +{ + ssize_t rc; + __u64 * pcreatetime; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u64); + else if (size < sizeof(__u64)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pcreatetime = (__u64 *)value; + *pcreatetime = CIFS_I(inode)->createtime; + return sizeof(__u64); + + return rc; +} + + static int cifs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) @@ -168,10 +217,19 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = -ENOMEM; goto out; } - /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ switch (handler->flags) { case XATTR_USER: + cifs_dbg(FYI, "%s:querying user xattr %s\n", __func__, name); + if (strcmp(name, CIFS_XATTR_ATTRIB) == 0) { + rc = cifs_attrib_get(dentry, inode, value, size); + break; + } else if (strcmp(name, CIFS_XATTR_CREATETIME) == 0) { + rc = cifs_creation_time_get(dentry, inode, value, size); + break; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto out; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 61057b7dbddb..98f87fe8f186 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -151,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -171,17 +174,15 @@ static int do_page_crypto(struct inode *inode, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115acb5dee..6865663aac69 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } diff --git a/fs/exec.c b/fs/exec.c index 6fcfb3f7b137..4e497b9ee71e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 79101651fe9e..42f9a0a0c4ca 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -137,7 +137,7 @@ Espan: bad_entry: EXOFS_ERR( "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " - "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", + "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n", dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, _LLU(le64_to_cpu(p->inode_no)), rec_len, p->name_len); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index d831e24dc885..41b8b44a391c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -622,7 +622,7 @@ static int ext2_get_blocks(struct inode *inode, u32 *bno, bool *new, bool *boundary, int create) { - int err = -EIO; + int err; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -639,7 +639,7 @@ static int ext2_get_blocks(struct inode *inode, depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) - return (err); + return -EIO; partial = ext2_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ @@ -761,7 +761,6 @@ static int ext2_get_blocks(struct inode *inode, ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: - *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) *boundary = true; err = count; @@ -772,6 +771,8 @@ cleanup: brelse(partial->bh); partial--; } + if (err > 0) + *bno = le32_to_cpu(chain[depth-1].key); return err; } diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 02ddec6d8a7d..fdb19543af1e 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -128,12 +128,12 @@ static void debug_print_tree(struct ext4_sb_info *sbi) node = rb_first(&sbi->system_blks); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); - printk("%s%llu-%llu", first ? "" : ", ", + printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", entry->start_blk, entry->start_blk + entry->count - 1); first = 0; node = rb_next(node); } - printk("\n"); + printk(KERN_CONT "\n"); } int ext4_setup_system_zone(struct super_block *sb) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3ef1df6ae9ec..1aba469f8220 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -27,16 +27,15 @@ #ifdef CONFIG_EXT4_DEBUG extern ushort ext4_mballoc_debug; -#define mb_debug(n, fmt, a...) \ - do { \ - if ((n) <= ext4_mballoc_debug) { \ - printk(KERN_DEBUG "(%s, %d): %s: ", \ - __FILE__, __LINE__, __func__); \ - printk(fmt, ## a); \ - } \ - } while (0) +#define mb_debug(n, fmt, ...) \ +do { \ + if ((n) <= ext4_mballoc_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: " fmt, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ + } \ +} while (0) #else -#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) +#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f92f10d4f66a..104f8bfba718 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -577,12 +577,13 @@ static inline unsigned dx_node_limit(struct inode *dir) static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); - printk(KERN_DEBUG "%s index ", label); + printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { - printk("%x->%lu ", i ? dx_get_hash(entries + i) : - 0, (unsigned long)dx_get_block(entries + i)); + printk(KERN_CONT " %x->%lu", + i ? dx_get_hash(entries + i) : 0, + (unsigned long)dx_get_block(entries + i)); } - printk("\n"); + printk(KERN_CONT "\n"); } struct stats @@ -679,7 +680,7 @@ static struct stats dx_show_leaf(struct inode *dir, } de = ext4_next_entry(de, size); } - printk("(%i)\n", names); + printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } @@ -798,7 +799,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; - dxtrace(printk(".")); + dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else @@ -810,7 +811,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, at = entries; while (n--) { - dxtrace(printk(",")); + dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; @@ -821,7 +822,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at), + dxtrace(printk(KERN_CONT " %x->%u\n", + at == entries ? 0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6db81fbcbaa6..20da99da0a34 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -597,14 +597,15 @@ void __ext4_std_error(struct super_block *sb, const char *function, void __ext4_abort(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, - function, line); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); if ((sb->s_flags & MS_RDONLY) == 0) { @@ -2715,12 +2716,12 @@ static void print_daily_error_info(unsigned long arg) es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) - printk(": inode %u", + printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) - printk(": block %llu", (unsigned long long) + printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); - printk("\n"); + printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", @@ -2729,12 +2730,12 @@ static void print_daily_error_info(unsigned long arg) es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) - printk(": inode %u", + printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) - printk(": block %llu", (unsigned long long) + printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); - printk("\n"); + printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 73bcfd41f5f2..42145be5c6b4 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,14 +223,18 @@ static struct attribute *ext4_attrs[] = { EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); +#ifdef CONFIG_EXT4_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), +#ifdef CONFIG_EXT4_FS_ENCRYPTION ATTR_LIST(encryption), +#endif ATTR_LIST(metadata_csum_seed), NULL, }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c15d63389957..d77be9e9f535 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -61,18 +61,12 @@ #include "acl.h" #ifdef EXT4_XATTR_DEBUG -# define ea_idebug(inode, f...) do { \ - printk(KERN_DEBUG "inode %s:%lu: ", \ - inode->i_sb->s_id, inode->i_ino); \ - printk(f); \ - printk("\n"); \ - } while (0) -# define ea_bdebug(bh, f...) do { \ - printk(KERN_DEBUG "block %pg:%lu: ", \ - bh->b_bdev, (unsigned long) bh->b_blocknr); \ - printk(f); \ - printk("\n"); \ - } while (0) +# define ea_idebug(inode, fmt, ...) \ + printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ + inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) \ + printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ + bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) @@ -241,7 +235,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, int error = -EFSCORRUPTED; if (((void *) header >= end) || - (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC))) + (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; error = ext4_xattr_check_names(entry, end, entry); errout: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 93985c64d8a8..6f14ee923acd 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -852,16 +852,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); diff --git a/fs/iomap.c b/fs/iomap.c index 013d1d36fbbf..a8ee8c33ca78 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -433,8 +433,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, struct page *page = data; int ret; - ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length, - NULL, iomap); + ret = __block_write_begin_int(page, pos, length, NULL, iomap); if (ret) return ret; @@ -561,7 +560,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } while (len > 0) { - ret = iomap_apply(inode, start, len, 0, ops, &ctx, + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); /* inode with no (attribute) mapping will give ENOENT */ if (ret == -ENOENT) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index ad0c745ebad7..871c8b392099 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -687,6 +687,11 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) pri_bh = NULL; root_found: + /* We don't support read-write mounts */ + if (!(s->s_flags & MS_RDONLY)) { + error = -EACCES; + goto out_freebh; + } if (joliet_level && (pri == NULL || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. @@ -1501,9 +1506,6 @@ struct inode *__isofs_iget(struct super_block *sb, static struct dentry *isofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - /* We don't support read-write mounts */ - if (!(flags & MS_RDONLY)) - return ERR_PTR(-EACCES); return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3d8246a9faa4..e1652665bd93 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1149,6 +1149,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; @@ -1156,8 +1157,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "set next transaction"); spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; + spin_unlock(&journal->j_list_lock); } - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); /* diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index dcd96aac02f5..cf4c636ff4da 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -110,8 +110,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * - * return value: length of the string. If greater than buflen, - * then contents of buf are undefined. On error, -1 is returned. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, @@ -119,9 +120,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; - size_t depth_from, depth_to, len = 0, nlen = 0; - char *p; - int i; + size_t depth_from, depth_to, len = 0; + int i, j; if (!kn_from) kn_from = kernfs_root(kn_to)->kn; @@ -131,7 +131,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) - return -1; + return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); @@ -144,22 +144,16 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, len < buflen ? buflen - len : 0); /* Calculate how many bytes we need for the rest */ - for (kn = kn_to; kn != common; kn = kn->parent) - nlen += strlen(kn->name) + 1; - - if (len + nlen >= buflen) - return len + nlen; - - p = buf + len + nlen; - *p = '\0'; - for (kn = kn_to; kn != common; kn = kn->parent) { - size_t tmp = strlen(kn->name); - p -= tmp; - memcpy(p, kn->name, tmp); - *(--p) = '/'; + for (i = depth_to - 1; i >= 0; i--) { + for (kn = kn_to, j = 0; j < i; j++) + kn = kn->parent; + len += strlcpy(buf + len, "/", + len < buflen ? buflen - len : 0); + len += strlcpy(buf + len, kn->name, + len < buflen ? buflen - len : 0); } - return len + nlen; + return len; } /** @@ -186,29 +180,6 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) } /** - * kernfs_path_len - determine the length of the full path of a given node - * @kn: kernfs_node of interest - * - * The returned length doesn't include the space for the terminating '\0'. - */ -size_t kernfs_path_len(struct kernfs_node *kn) -{ - size_t len = 0; - unsigned long flags; - - spin_lock_irqsave(&kernfs_rename_lock, flags); - - do { - len += strlen(kn->name) + 1; - kn = kn->parent; - } while (kn && kn->parent); - - spin_unlock_irqrestore(&kernfs_rename_lock, flags); - - return len; -} - -/** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest @@ -220,8 +191,9 @@ size_t kernfs_path_len(struct kernfs_node *kn) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * If @buf isn't long enough, the return value will be greater than @buflen - * and @buf contents are undefined. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) @@ -237,28 +209,6 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** - * kernfs_path - build full path of a given node - * @kn: kernfs_node of interest - * @buf: buffer to copy @kn's name into - * @buflen: size of @buf - * - * Builds and returns the full path of @kn in @buf of @buflen bytes. The - * path is built from the end of @buf so the returned pointer usually - * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated - * and %NULL is returned. - */ -char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) -{ - int ret; - - ret = kernfs_path_from_node(kn, NULL, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; -} -EXPORT_SYMBOL_GPL(kernfs_path); - -/** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 2bcb86e6e6ca..78219d5644e9 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -911,6 +911,7 @@ const struct file_operations kernfs_file_fops = { .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, + .fsync = noop_fsync, }; /** diff --git a/fs/locks.c b/fs/locks.c index ce93b416b490..22c5b4aa4961 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1609,6 +1609,7 @@ int fcntl_getlease(struct file *filp) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { @@ -1618,6 +1619,8 @@ int fcntl_getlease(struct file *filp) break; } spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } return type; @@ -2529,11 +2532,14 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) if (list_empty(&ctx->flc_lease)) return; + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) if (filp == fl->fl_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } diff --git a/fs/namei.c b/fs/namei.c index a7f601cd521a..5b4eed221530 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4668,6 +4668,31 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(generic_readlink); +/** + * vfs_get_link - get symlink body + * @dentry: dentry on which to get symbolic link + * @done: caller needs to free returned data with this + * + * Calls security hook and i_op->get_link() on the supplied inode. + * + * It does not touch atime. That's up to the caller if necessary. + * + * Does not work on "special" symlinks like /proc/$$/fd/N + */ +const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) +{ + const char *res = ERR_PTR(-EINVAL); + struct inode *inode = d_inode(dentry); + + if (d_is_symlink(dentry)) { + res = ERR_PTR(security_inode_readlink(dentry)); + if (!res) + res = inode->i_op->get_link(dentry, inode, done); + } + return res; +} +EXPORT_SYMBOL(vfs_get_link); + /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) diff --git a/fs/namespace.c b/fs/namespace.c index 58aca9c931ac..e6c234b1a645 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2824,6 +2824,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) return new_ns; } +__latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 217847679f0e..2905479f214a 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -344,9 +344,10 @@ static void bl_write_cleanup(struct work_struct *work) u64 start = hdr->args.offset & (loff_t)PAGE_MASK; u64 end = (hdr->args.offset + hdr->args.count + PAGE_SIZE - 1) & (loff_t)PAGE_MASK; + u64 lwb = hdr->args.offset + hdr->args.count; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT, end); + (end - start) >> SECTOR_SHIFT, lwb); } pnfs_ld_write_done(hdr); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ad917bd72b38..7897826d7c51 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1545,7 +1545,7 @@ static int update_open_stateid(struct nfs4_state *state, struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *deleg_cur; - nfs4_stateid freeme = {0}; + nfs4_stateid freeme = { }; int ret = 0; fmode &= (FMODE_READ|FMODE_WRITE); diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 1e8fe844e69f..5355efba4bc8 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) } } - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ret = 1; out_release_op: op_release(new_op); @@ -94,8 +94,9 @@ out_drop: static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) { int ret; + unsigned long time = (unsigned long) dentry->d_fsdata; - if (time_before(jiffies, dentry->d_time)) + if (time_before(jiffies, time)) return 1; if (flags & LOOKUP_RCU) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 66ea0cc37b18..02cc6139ec90 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -621,9 +621,9 @@ static int orangefs_file_release(struct inode *inode, struct file *file) * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ - if (file->f_path.dentry->d_inode && - file->f_path.dentry->d_inode->i_mapping && - mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) { + if (file_inode(file) && + file_inode(file)->i_mapping && + mapping_nrpages(&file_inode(file)->i_data)) { if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { gossip_debug(GOSSIP_INODE_DEBUG, "calling flush_racache on %pU\n", @@ -632,7 +632,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n"); } - truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping, + truncate_inode_pages(file_inode(file)->i_mapping, 0); } return 0; @@ -648,7 +648,7 @@ static int orangefs_fsync(struct file *file, { int ret = -EINVAL; struct orangefs_inode_s *orangefs_inode = - ORANGEFS_I(file->f_path.dentry->d_inode); + ORANGEFS_I(file_inode(file)); struct orangefs_kernel_op_s *new_op = NULL; /* required call */ @@ -661,7 +661,7 @@ static int orangefs_fsync(struct file *file, ret = service_operation(new_op, "orangefs_fsync", - get_interruptible_flag(file->f_path.dentry->d_inode)); + get_interruptible_flag(file_inode(file))); gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_fsync got return value of %d\n", @@ -669,7 +669,7 @@ static int orangefs_fsync(struct file *file, op_release(new_op); - orangefs_flush_inode(file->f_path.dentry->d_inode); + orangefs_flush_inode(file_inode(file)); return ret; } diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index d15d3d2dba62..a290ff6ec756 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -72,7 +72,7 @@ static int orangefs_create(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, @@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); if (IS_ERR(inode)) { @@ -322,7 +322,7 @@ static int orangefs_symlink(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, @@ -386,7 +386,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 0a82048f3aaf..3bf803d732c5 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -580,4 +580,11 @@ static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) #endif } +static inline void orangefs_set_timeout(struct dentry *dentry) +{ + unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + + dentry->d_fsdata = (void *) time; +} + #endif /* __ORANGEFSKERNEL_H */ diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3f803b3a1f82..aeb60f791418 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -57,6 +57,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; int uninitialized_var(error); + size_t slen; if (!(old->d_inode->i_opflags & IOP_XATTR) || !(new->d_inode->i_opflags & IOP_XATTR)) @@ -79,7 +80,16 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) goto out; } - for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + for (name = buf; list_size; name += slen) { + slen = strnlen(name, list_size) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > list_size)) { + error = -EIO; + break; + } + list_size -= slen; + if (ovl_is_private_xattr(name)) continue; retry: @@ -174,40 +184,6 @@ out_fput: return error; } -static char *ovl_read_symlink(struct dentry *realdentry) -{ - int res; - char *buf; - struct inode *inode = realdentry->d_inode; - mm_segment_t old_fs; - - res = -EINVAL; - if (!inode->i_op->readlink) - goto err; - - res = -ENOMEM; - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - goto err; - - old_fs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - res = inode->i_op->readlink(realdentry, - (char __user *)buf, PAGE_SIZE - 1); - set_fs(old_fs); - if (res < 0) { - free_page((unsigned long) buf); - goto err; - } - buf[res] = '\0'; - - return buf; - -err: - return ERR_PTR(res); -} - static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) { struct iattr attr = { @@ -354,19 +330,20 @@ out_cleanup: int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct path *lowerpath, struct kstat *stat) { + DEFINE_DELAYED_CALL(done); struct dentry *workdir = ovl_workdir(dentry); int err; struct kstat pstat; struct path parentpath; + struct dentry *lowerdentry = lowerpath->dentry; struct dentry *upperdir; struct dentry *upperdentry; - const struct cred *old_cred; - char *link = NULL; + const char *link = NULL; if (WARN_ON(!workdir)) return -EROFS; - ovl_do_check_copy_up(lowerpath->dentry); + ovl_do_check_copy_up(lowerdentry); ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; @@ -376,13 +353,11 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return err; if (S_ISLNK(stat->mode)) { - link = ovl_read_symlink(lowerpath->dentry); + link = vfs_get_link(lowerdentry, &done); if (IS_ERR(link)) return PTR_ERR(link); } - old_cred = ovl_override_creds(dentry->d_sb); - err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { pr_err("overlayfs: failed to lock workdir+upperdir\n"); @@ -403,19 +378,16 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, } out_unlock: unlock_rename(workdir, upperdir); - revert_creds(old_cred); - - if (link) - free_page((unsigned long) link); + do_delayed_call(&done); return err; } int ovl_copy_up(struct dentry *dentry) { - int err; + int err = 0; + const struct cred *old_cred = ovl_override_creds(dentry->d_sb); - err = 0; while (!err) { struct dentry *next; struct dentry *parent; @@ -447,6 +419,7 @@ int ovl_copy_up(struct dentry *dentry) dput(parent); dput(next); } + revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 5f90ddf778ba..306b6c161840 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -14,6 +14,7 @@ #include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> +#include <linux/atomic.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -37,8 +38,10 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) { struct dentry *temp; char name[20]; + static atomic_t temp_id = ATOMIC_INIT(0); - snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + /* counter is allowed to wrap, since temp dentries are ephemeral */ + snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c18d6a4ff456..c58f01babf30 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,6 +19,7 @@ static int ovl_copy_up_truncate(struct dentry *dentry) struct dentry *parent; struct kstat stat; struct path lowerpath; + const struct cred *old_cred; parent = dget_parent(dentry); err = ovl_copy_up(parent); @@ -26,12 +27,14 @@ static int ovl_copy_up_truncate(struct dentry *dentry) goto out_dput_parent; ovl_path_lower(dentry, &lowerpath); - err = vfs_getattr(&lowerpath, &stat); - if (err) - goto out_dput_parent; - stat.size = 0; - err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&lowerpath, &stat); + if (!err) { + stat.size = 0; + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + } + revert_creds(old_cred); out_dput_parent: dput(parent); @@ -153,45 +156,18 @@ static const char *ovl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct dentry *realdentry; - struct inode *realinode; const struct cred *old_cred; const char *p; if (!dentry) return ERR_PTR(-ECHILD); - realdentry = ovl_dentry_real(dentry); - realinode = realdentry->d_inode; - - if (WARN_ON(!realinode->i_op->get_link)) - return ERR_PTR(-EPERM); - old_cred = ovl_override_creds(dentry->d_sb); - p = realinode->i_op->get_link(realdentry, realinode, done); + p = vfs_get_link(ovl_dentry_real(dentry), done); revert_creds(old_cred); return p; } -static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) -{ - struct path realpath; - struct inode *realinode; - const struct cred *old_cred; - int err; - - ovl_path_real(dentry, &realpath); - realinode = realpath.dentry->d_inode; - - if (!realinode->i_op->readlink) - return -EINVAL; - - old_cred = ovl_override_creds(dentry->d_sb); - err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz); - revert_creds(old_cred); - return err; -} - bool ovl_is_private_xattr(const char *name) { return strncmp(name, OVL_XATTR_PREFIX, @@ -375,7 +351,7 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, - .readlink = ovl_readlink, + .readlink = generic_readlink, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .update_time = ovl_update_time, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7e3f0127fc1a..bcf3965be819 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,12 +273,11 @@ static bool ovl_is_opaquedir(struct dentry *dentry) { int res; char val; - struct inode *inode = dentry->d_inode; - if (!S_ISDIR(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) + if (!d_is_dir(dentry)) return false; - res = __vfs_getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1); + res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; @@ -419,16 +418,12 @@ static bool ovl_dentry_weird(struct dentry *dentry) DCACHE_OP_COMPARE); } -static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb, - struct dentry *dir, +static inline struct dentry *ovl_lookup_real(struct dentry *dir, const struct qstr *name) { - const struct cred *old_cred; struct dentry *dentry; - old_cred = ovl_override_creds(ovl_sb); dentry = lookup_one_len_unlocked(name->name, dir, name->len); - revert_creds(old_cred); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -469,6 +464,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe; + const struct cred *old_cred; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; @@ -479,9 +475,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int i; int err; + old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name); + this = ovl_lookup_real(upperdir, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) goto out; @@ -514,8 +511,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, bool opaque = false; struct path lowerpath = poe->lowerstack[i]; - this = ovl_lookup_real(dentry->d_sb, - lowerpath.dentry, &dentry->d_name); + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) { /* @@ -588,6 +584,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_copyattr(realdentry->d_inode, inode); } + revert_creds(old_cred); oe->opaque = upperopaque; oe->__upperdentry = upperdentry; memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); @@ -606,6 +603,7 @@ out_put: out_put_upper: dput(upperdentry); out: + revert_creds(old_cred); return ERR_PTR(err); } @@ -834,6 +832,19 @@ retry: if (err) goto out_dput; + /* + * Try to remove POSIX ACL xattrs from workdir. We are good if: + * + * a) success (there was a POSIX ACL xattr and was removed) + * b) -ENODATA (there was no POSIX ACL xattr) + * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported) + * + * There are various other error values that could effectively + * mean that the xattr doesn't exist (e.g. -ERANGE is returned + * if the xattr name is too long), but the set of filesystems + * allowed as upper are limited to "normal" ones, where checking + * for the above two errors is sufficient. + */ err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; @@ -1292,6 +1303,12 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!oe) goto out_put_cred; + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_op = &ovl_super_operations; + sb->s_xattr = ovl_xattr_handlers; + sb->s_fs_info = ufs; + sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR)); if (!root_dentry) goto out_free_oe; @@ -1315,12 +1332,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ovl_inode_init(d_inode(root_dentry), realinode, !!upperpath.dentry); ovl_copyattr(realinode, d_inode(root_dentry)); - sb->s_magic = OVERLAYFS_SUPER_MAGIC; - sb->s_op = &ovl_super_operations; - sb->s_xattr = ovl_xattr_handlers; sb->s_root = root_dentry; - sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; return 0; diff --git a/fs/proc/array.c b/fs/proc/array.c index 89600fd5963d..81818adb8e9e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -412,10 +412,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); - if (permitted) { - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); - } + /* + * esp and eip are intentionally zeroed out. There is no + * non-racy way to read them without freezing the task. + * Programs that need reliable values can use ptrace(2). + */ } get_task_comm(tcomm, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index c2964d890c9a..ca651ac00660 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -832,6 +832,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; + unsigned int flags; if (!mm) return 0; @@ -844,6 +845,11 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + /* Maybe we should limit FOLL_FORCE to actual ptrace users? */ + flags = FOLL_FORCE; + if (write) + flags |= FOLL_WRITE; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -852,7 +858,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, break; } - this_len = access_remote_vm(mm, addr, page, this_len, write); + this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; @@ -964,8 +970,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (env_start + src), - page, this_len, 0); + retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); if (retval <= 0) { ret = retval; @@ -1007,6 +1012,9 @@ static ssize_t auxv_read(struct file *file, char __user *buf, { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; + + if (!mm) + return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6909582ce5e5..35b92d81692f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -266,24 +266,15 @@ static int do_maps_open(struct inode *inode, struct file *file, * /proc/PID/maps that is the stack of the main task. */ static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; } static void @@ -354,7 +345,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - if (is_stack(priv, vma, is_pid)) + if (is_stack(priv, vma)) name = "[stack]"; } @@ -1669,7 +1660,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else if (is_stack(proc_priv, vma, is_pid)) { + } else if (is_stack(proc_priv, vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index faacb0c0d857..37175621e890 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -124,25 +124,17 @@ unsigned long task_statm(struct mm_struct *mm, } static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; } /* @@ -184,7 +176,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(priv, vma, is_pid)) { + } else if (mm && is_stack(priv, vma)) { seq_pad(m, ' '); seq_printf(m, "[stack]"); } diff --git a/fs/read_write.c b/fs/read_write.c index 66215a7b17cf..190e0d362581 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -730,6 +730,35 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) +/** + * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace + * into the kernel and check that it is valid. + * + * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE. + * @uvector: Pointer to the userspace array. + * @nr_segs: Number of elements in userspace array. + * @fast_segs: Number of elements in @fast_pointer. + * @fast_pointer: Pointer to (usually small on-stack) kernel array. + * @ret_pointer: (output parameter) Pointer to a variable that will point to + * either @fast_pointer, a newly allocated kernel array, or NULL, + * depending on which array was used. + * + * This function copies an array of &struct iovec of @nr_segs from + * userspace into the kernel and checks that each element is valid (e.g. + * it does not point to a kernel address or cause overflow by being too + * large, etc.). + * + * As an optimization, the caller may provide a pointer to a small + * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long + * (the size of this array, or 0 if unused, should be given in @fast_segs). + * + * @ret_pointer will always point to the array that was used, so the + * caller must take care not to call kfree() on it e.g. in case the + * @fast_pointer array was used and it was allocated on the stack. + * + * Return: The total number of bytes covered by the iovec array on success + * or a negative error code on error. + */ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, diff --git a/fs/super.c b/fs/super.c index c2ff475c1711..c183835566c1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1269,25 +1269,34 @@ EXPORT_SYMBOL(__sb_start_write); static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); - /* - * We are going to return to userspace and forget about this lock, the - * ownership goes to the caller of thaw_super() which does unlock. - * - * FIXME: we should do this before return from freeze_super() after we - * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() - * should re-acquire these locks before s_op->unfreeze_fs(sb). However - * this leads to lockdep false-positives, so currently we do the early - * release right after acquire. - */ - percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +/* + * We are going to return to userspace and forget about these locks, the + * ownership goes to the caller of thaw_super() which does unlock(). + */ +static void lockdep_sb_freeze_release(struct super_block *sb) +{ + int level; + + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); +} + +/* + * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). + */ +static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); +} + +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); @@ -1379,10 +1388,11 @@ int freeze_super(struct super_block *sb) } } /* - * This is just for debugging purposes so that fs can warn if it - * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + * For debugging purposes so that fs can warn if it sees write activity + * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; + lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return 0; } @@ -1399,7 +1409,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_writers.frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; } @@ -1409,11 +1419,14 @@ int thaw_super(struct super_block *sb) goto out; } + lockdep_sb_freeze_acquire(sb); + if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); + lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return error; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 94374e435025..2b67bda2021b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,14 +21,14 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - char *buf, *path = NULL; + char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) - path = kernfs_path(parent, buf, PATH_MAX); + kernfs_path(parent, buf, PATH_MAX); WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", - path, name); + buf, name); kfree(buf); } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index c8f60df2733e..ca16c5d7bab1 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -439,7 +439,7 @@ static unsigned int vfs_dent_type(uint8_t type) */ static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err; + int err = 0; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; @@ -541,14 +541,20 @@ out: kfree(file->private_data); file->private_data = NULL; - if (err != -ENOENT) { + if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); - return err; - } + else + /* + * -ENOENT is a non-fatal error in this context, the TNC uses + * it to indicate that the cursor moved past the current directory + * and readdir() has to stop. + */ + err = 0; + /* 2 is a special value indicating that there are no more direntries */ ctx->pos = 2; - return 0; + return err; } /* Free saved readdir() state when the directory is closed */ @@ -1060,9 +1066,9 @@ static void unlock_4_inodes(struct inode *inode1, struct inode *inode2, mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } -static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int do_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct ubifs_info *c = old_dir->i_sb->s_fs_info; struct inode *old_inode = d_inode(old_dentry); @@ -1323,7 +1329,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int ubifs_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1336,7 +1342,7 @@ static int ubifs_rename2(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_EXCHANGE) return ubifs_xrename(old_dir, old_dentry, new_dir, new_dentry); - return ubifs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -1387,7 +1393,7 @@ const struct inode_operations ubifs_dir_inode_operations = { .mkdir = ubifs_mkdir, .rmdir = ubifs_rmdir, .mknod = ubifs_mknod, - .rename = ubifs_rename2, + .rename = ubifs_rename, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .listxattr = ubifs_listxattr, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6c2f4d41ed73..d9f9615bfd71 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -172,6 +172,7 @@ out_cancel: host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_names -= nm->len; mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -478,6 +479,7 @@ out_cancel: host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(nm->len); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names += nm->len; mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c27344cf38e1..c6eb21940783 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3974,9 +3974,6 @@ xfs_bmap_remap_alloc( * allocating, so skip that check by pretending to be freeing. */ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); - if (error) - goto error0; -error0: xfs_perag_put(args.pag); if (error) trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); @@ -3999,6 +3996,39 @@ xfs_bmap_alloc( return xfs_bmap_btalloc(ap); } +/* Trim extent to fit a logical block range. */ +void +xfs_trim_extent( + struct xfs_bmbt_irec *irec, + xfs_fileoff_t bno, + xfs_filblks_t len) +{ + xfs_fileoff_t distance; + xfs_fileoff_t end = bno + len; + + if (irec->br_startoff + irec->br_blockcount <= bno || + irec->br_startoff >= end) { + irec->br_blockcount = 0; + return; + } + + if (irec->br_startoff < bno) { + distance = bno - irec->br_startoff; + if (isnullstartblock(irec->br_startblock)) + irec->br_startblock = DELAYSTARTBLOCK; + if (irec->br_startblock != DELAYSTARTBLOCK && + irec->br_startblock != HOLESTARTBLOCK) + irec->br_startblock += distance; + irec->br_startoff += distance; + irec->br_blockcount -= distance; + } + + if (end < irec->br_startoff + irec->br_blockcount) { + distance = irec->br_startoff + irec->br_blockcount - end; + irec->br_blockcount -= distance; + } +} + /* * Trim the returned map to the required bounds */ @@ -4829,6 +4859,219 @@ xfs_bmap_split_indlen( return stolen; } +int +xfs_bmap_del_extent_delay( + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec new; + int64_t da_old, da_new, da_diff = 0; + xfs_fileoff_t del_endoff, got_endoff; + xfs_filblks_t got_indlen, new_indlen, stolen; + int error = 0, state = 0; + bool isrt; + + XFS_STATS_INC(mp, xs_del_exlist); + + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got->br_startoff + got->br_blockcount; + da_old = startblockval(got->br_startblock); + da_new = 0; + + ASSERT(*idx >= 0); + ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(del->br_blockcount > 0); + ASSERT(got->br_startoff <= del->br_startoff); + ASSERT(got_endoff >= del_endoff); + + if (isrt) { + int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); + + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_frextents(mp, rtexts); + } + + /* + * Update the inode delalloc counter now and wait to update the + * sb counters as we might have to borrow some blocks for the + * indirect block accounting. + */ + xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ip->i_delayed_blks -= del->br_blockcount; + + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; + + if (got->br_startoff == del->br_startoff) + state |= BMAP_LEFT_CONTIG; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_CONTIG; + + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, state); + --*idx; + break; + case BMAP_LEFT_CONTIG: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_startoff = del_endoff; + got->br_blockcount -= del->br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, + got->br_blockcount), da_old); + got->br_startblock = nullstartblock((int)da_new); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case BMAP_RIGHT_CONTIG: + /* + * Deleting the last part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount = got->br_blockcount - del->br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, + got->br_blockcount), da_old); + got->br_startblock = nullstartblock((int)da_new); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case 0: + /* + * Deleting the middle of the extent. + * + * Distribute the original indlen reservation across the two new + * extents. Steal blocks from the deleted extent if necessary. + * Stealing blocks simply fudges the fdblocks accounting below. + * Warn if either of the new indlen reservations is zero as this + * can lead to delalloc problems. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + + got->br_blockcount = del->br_startoff - got->br_startoff; + got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount); + + new.br_blockcount = got_endoff - del_endoff; + new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount); + + WARN_ON_ONCE(!got_indlen || !new_indlen); + stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen, + del->br_blockcount); + + got->br_startblock = nullstartblock((int)got_indlen); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); + + new.br_startoff = del_endoff; + new.br_state = got->br_state; + new.br_startblock = nullstartblock((int)new_indlen); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, &new, state); + + da_new = got_indlen + new_indlen - stolen; + del->br_blockcount -= stolen; + break; + } + + ASSERT(da_old >= da_new); + da_diff = da_old - da_new; + if (!isrt) + da_diff += del->br_blockcount; + if (da_diff) + xfs_mod_fdblocks(mp, da_diff, false); + return error; +} + +void +xfs_bmap_del_extent_cow( + struct xfs_inode *ip, + xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec new; + xfs_fileoff_t del_endoff, got_endoff; + int state = BMAP_COWFORK; + + XFS_STATS_INC(mp, xs_del_exlist); + + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got->br_startoff + got->br_blockcount; + + ASSERT(*idx >= 0); + ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(del->br_blockcount > 0); + ASSERT(got->br_startoff <= del->br_startoff); + ASSERT(got_endoff >= del_endoff); + ASSERT(!isnullstartblock(got->br_startblock)); + + if (got->br_startoff == del->br_startoff) + state |= BMAP_LEFT_CONTIG; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_CONTIG; + + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, state); + --*idx; + break; + case BMAP_LEFT_CONTIG: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_startoff = del_endoff; + got->br_blockcount -= del->br_blockcount; + got->br_startblock = del->br_startblock + del->br_blockcount; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case BMAP_RIGHT_CONTIG: + /* + * Deleting the last part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount -= del->br_blockcount; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case 0: + /* + * Deleting the middle of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount = del->br_startoff - got->br_startoff; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + new.br_startoff = del_endoff; + new.br_blockcount = got_endoff - del_endoff; + new.br_state = got->br_state; + new.br_startblock = del->br_startblock + del->br_blockcount; + + ++*idx; + xfs_iext_insert(ip, *idx, 1, &new, state); + break; + } +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). @@ -5171,175 +5414,6 @@ done: return error; } -/* Remove an extent from the CoW fork. Similar to xfs_bmap_del_extent. */ -int -xfs_bunmapi_cow( - struct xfs_inode *ip, - struct xfs_bmbt_irec *del) -{ - xfs_filblks_t da_new; - xfs_filblks_t da_old; - xfs_fsblock_t del_endblock = 0; - xfs_fileoff_t del_endoff; - int delay; - struct xfs_bmbt_rec_host *ep; - int error; - struct xfs_bmbt_irec got; - xfs_fileoff_t got_endoff; - struct xfs_ifork *ifp; - struct xfs_mount *mp; - xfs_filblks_t nblks; - struct xfs_bmbt_irec new; - /* REFERENCED */ - uint qfield; - xfs_filblks_t temp; - xfs_filblks_t temp2; - int state = BMAP_COWFORK; - int eof; - xfs_extnum_t eidx; - - mp = ip->i_mount; - XFS_STATS_INC(mp, xs_del_exlist); - - ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, - &eidx, &got, &new); - - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp; - ASSERT((eidx >= 0) && (eidx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(del->br_blockcount > 0); - ASSERT(got.br_startoff <= del->br_startoff); - del_endoff = del->br_startoff + del->br_blockcount; - got_endoff = got.br_startoff + got.br_blockcount; - ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - qfield = 0; - error = 0; - /* - * If deleting a real allocation, must free up the disk space. - */ - if (!delay) { - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - /* - * Set up del_endblock and cur for later. - */ - del_endblock = del->br_startblock + del->br_blockcount; - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; - } - qfield = qfield; - nblks = nblks; - - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: - /* - * Matches the whole extent. Delete the entry. - */ - xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK); - --eidx; - break; - - case 2: - /* - * Deleting the first part of the extent. - */ - trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - break; - - case 1: - /* - * Deleting the last part of the extent. - */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - break; - - case 0: - /* - * Deleting the middle of the extent. - */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; - new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - } else { - temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } - } - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - xfs_iext_insert(ip, eidx + 1, 1, &new, state); - ++eidx; - break; - } - - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) - xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); - - return error; -} - /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f97db7132564..7cae6ec27fa6 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -190,6 +190,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif +void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, + xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, @@ -221,7 +223,11 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); -int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del); +int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, + xfs_extnum_t *idx, struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del); +void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5c8e6f2ce44f..0e80993c8a59 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4826,7 +4826,7 @@ xfs_btree_calc_size( return rval; } -int +static int xfs_btree_count_blocks_helper( struct xfs_btree_cur *cur, int level, diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 3cc3cf767474..ac9a003dd29a 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -191,8 +191,7 @@ xfs_dquot_buf_verify_crc( if (mp->m_quotainfo) ndquots = mp->m_quotainfo->qi_dqperchunk; else - ndquots = xfs_calc_dquots_per_chunk( - XFS_BB_TO_FSB(mp, bp->b_length)); + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index f6547fc5e016..6b7579e7b60a 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -865,7 +865,6 @@ typedef struct xfs_timestamp { * padding field for v3 inodes. */ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __be16 di_mode; /* mode and type of file */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8de9a3a29589..134424fac434 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -57,6 +57,17 @@ xfs_inobp_check( } #endif +bool +xfs_dinode_good_version( + struct xfs_mount *mp, + __u8 version) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return version == 3; + + return version == 1 || version == 2; +} + /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence @@ -91,7 +102,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); + xfs_dinode_good_version(mp, dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, XFS_RANDOM_ITOBP_INOTOBP))) { diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 62d9d4681c8c..3cfe12a4f58a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -74,6 +74,8 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); +bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); + #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a314fc7b56fa..6e4f7f900fea 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -249,6 +249,7 @@ xfs_file_dio_aio_read( struct xfs_inode *ip = XFS_I(inode); loff_t isize = i_size_read(inode); size_t count = iov_iter_count(to); + loff_t end = iocb->ki_pos + count - 1; struct iov_iter data; struct xfs_buftarg *target; ssize_t ret = 0; @@ -272,49 +273,21 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - /* - * Locking is a bit tricky here. If we take an exclusive lock for direct - * IO, we effectively serialise all new concurrent read IO to this file - * and block it behind IO that is currently in progress because IO in - * progress holds the IO lock shared. We only need to hold the lock - * exclusive to blow away the page cache, so only take lock exclusively - * if the page cache needs invalidation. This allows the normal direct - * IO case of no page cache pages to proceeed concurrently without - * serialisation. - */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if (mapping->nrpages) { - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_unlock; /* - * The generic dio code only flushes the range of the particular - * I/O. Because we take an exclusive lock here, this whole - * sequence is considerably more expensive for us. This has a - * noticeable performance impact for any file with cached pages, - * even when outside of the range of the particular I/O. - * - * Hence, amortize the cost of the lock against a full file - * flush and reduce the chances of repeated iolock cycles going - * forward. + * Invalidate whole pages. This can return an error if we fail + * to invalidate a page, but this should never happen on XFS. + * Warn if it does fail. */ - if (mapping->nrpages) { - ret = filemap_write_and_wait(mapping); - if (ret) { - xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; - } - - /* - * Invalidate whole pages. This can return an error if - * we fail to invalidate a page, but this should never - * happen on XFS. Warn if it does fail. - */ - ret = invalidate_inode_pages2(mapping); - WARN_ON_ONCE(ret); - ret = 0; - } - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; } data = *to; @@ -324,8 +297,9 @@ xfs_file_dio_aio_read( iocb->ki_pos += ret; iov_iter_advance(to, ret); } - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); +out_unlock: + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -570,61 +544,49 @@ xfs_file_dio_aio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* "unaligned" here means not aligned to a filesystem block */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) - unaligned_io = 1; - /* - * We don't need to take an exclusive lock unless there page cache needs - * to be invalidated or unaligned IO is being executed. We don't need to - * consider the EOF extension case here because - * xfs_file_aio_write_checks() will relock the inode as necessary for - * EOF zeroing cases and fill out the new inode size as appropriate. + * Don't take the exclusive iolock here unless the I/O is unaligned to + * the file system block size. We don't need to consider the EOF + * extension case here because xfs_file_aio_write_checks() will relock + * the inode as necessary for EOF zeroing cases and fill out the new + * inode size as appropriate. */ - if (unaligned_io || mapping->nrpages) + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) { + unaligned_io = 1; iolock = XFS_IOLOCK_EXCL; - else + } else { iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, iolock); - - /* - * Recheck if there are cached pages that need invalidate after we got - * the iolock to protect against other threads adding new pages while - * we were waiting for the iolock. - */ - if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, iolock); - iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); } + xfs_rw_ilock(ip, iolock); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); end = iocb->ki_pos + count - 1; - /* - * See xfs_file_dio_aio_read() for why we do a full-file flush here. - */ if (mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); if (ret) goto out; + /* * Invalidate whole pages. This can return an error if we fail * to invalidate a page, but this should never happen on XFS. * Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } /* * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to flush cached pages + * otherwise demote the lock if we had to take the exclusive lock + * for other reasons in xfs_file_aio_write_checks. */ if (unaligned_io) inode_dio_wait(inode); @@ -947,134 +909,6 @@ out_unlock: return error; } -/* - * Flush all file writes out to disk. - */ -static int -xfs_file_wait_for_io( - struct inode *inode, - loff_t offset, - size_t len) -{ - loff_t rounding; - loff_t ioffset; - loff_t iendoffset; - loff_t bs; - int ret; - - bs = inode->i_sb->s_blocksize; - inode_dio_wait(inode); - - rounding = max_t(xfs_off_t, bs, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - iendoffset); - return ret; -} - -/* Hook up to the VFS reflink function */ -STATIC int -xfs_file_share_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe) -{ - struct inode *inode_in; - struct inode *inode_out; - ssize_t ret; - loff_t bs; - loff_t isize; - int same_inode; - loff_t blen; - unsigned int flags = 0; - - inode_in = file_inode(file_in); - inode_out = file_inode(file_out); - bs = inode_out->i_sb->s_blocksize; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - if (IS_SWAPFILE(inode_in) || - IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Reflink only works within this filesystem. */ - if (inode_in->i_sb != inode_out->i_sb) - return -EXDEV; - same_inode = (inode_in->i_ino == inode_out->i_ino); - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) - return -EINVAL; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Don't share DAX file data for now. */ - if (IS_DAX(inode_in) || IS_DAX(inode_out)) - return -EINVAL; - - /* Are we going all the way to the end? */ - isize = i_size_read(inode_in); - if (isize == 0) - return 0; - if (len == 0) - len = isize - pos_in; - - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + len < pos_in || pos_out + len < pos_out || - pos_in + len > isize) - return -EINVAL; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + len > disize) - return -EINVAL; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; - - /* Wait for the completion of any pending IOs on srcfile */ - ret = xfs_file_wait_for_io(inode_in, pos_in, len); - if (ret) - goto out; - ret = xfs_file_wait_for_io(inode_out, pos_out, len); - if (ret) - goto out; - - if (is_dedupe) - flags |= XFS_REFLINK_DEDUPE; - ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), - pos_out, len, flags); - if (ret < 0) - goto out; - -out: - return ret; -} - STATIC ssize_t xfs_file_copy_range( struct file *file_in, @@ -1086,7 +920,7 @@ xfs_file_copy_range( { int error; - error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, + error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, false); if (error) return error; @@ -1101,7 +935,7 @@ xfs_file_clone_range( loff_t pos_out, u64 len) { - return xfs_file_share_range(file_in, pos_in, file_out, pos_out, + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, false); } @@ -1124,7 +958,7 @@ xfs_file_dedupe_range( if (len > XFS_MAX_DEDUPE_LEN) len = XFS_MAX_DEDUPE_LEN; - error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, + error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) return error; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 14796b744e0a..f295049db681 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1656,9 +1656,9 @@ void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { - trace_xfs_inode_set_eofblocks_tag(ip); + trace_xfs_inode_set_cowblocks_tag(ip); return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, - trace_xfs_perag_set_eofblocks, + trace_xfs_perag_set_cowblocks, XFS_ICI_COWBLOCKS_TAG); } @@ -1666,7 +1666,7 @@ void xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { - trace_xfs_inode_clear_eofblocks_tag(ip); + trace_xfs_inode_clear_cowblocks_tag(ip); return __xfs_inode_clear_eofblocks_tag(ip, - trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG); + trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d907eb9f8ef3..436e109bb01e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -566,6 +566,17 @@ xfs_file_iomap_begin_delay( xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, &got, &prev); if (!eof && got.br_startoff <= offset_fsb) { + if (xfs_is_reflink_inode(ip)) { + bool shared; + + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), + maxbytes_fsb); + xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); + error = xfs_reflink_reserve_cow(ip, &got, &shared); + if (error) + goto out_unlock; + } + trace_xfs_iomap_found(ip, offset, count, 0, &got); goto done; } @@ -961,19 +972,13 @@ xfs_file_iomap_begin( struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; - bool shared, trimmed; int nimaps = 1, error = 0; + bool shared = false, trimmed = false; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { - error = xfs_reflink_reserve_cow_range(ip, offset, length); - if (error < 0) - return error; - } - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ @@ -981,7 +986,16 @@ xfs_file_iomap_begin( iomap); } - lockmode = xfs_ilock_data_map_shared(ip); + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, XFS_ILOCK_EXCL); + } else { + lockmode = xfs_ilock_data_map_shared(ip); + } ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) @@ -991,16 +1005,24 @@ xfs_file_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); - if (error) { - xfs_iunlock(ip, lockmode); - return error; + if (error) + goto out_unlock; + + if (flags & IOMAP_REPORT) { + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, + &trimmed); + if (error) + goto out_unlock; } - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); - if (error) { - xfs_iunlock(ip, lockmode); - return error; + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + end_fsb = imap.br_startoff + imap.br_blockcount; + length = XFS_FSB_TO_B(mp, end_fsb) - offset; } if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { @@ -1039,6 +1061,9 @@ xfs_file_iomap_begin( if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; +out_unlock: + xfs_iunlock(ip, lockmode); + return error; } static int diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fc7873942bea..b341f10cf481 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1009,6 +1009,7 @@ xfs_mountfs( out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: + mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5965e9455d91..a279b4e7f5fe 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -182,7 +182,8 @@ xfs_reflink_trim_around_shared( if (!xfs_is_reflink_inode(ip) || ISUNWRITTEN(irec) || irec->br_startblock == HOLESTARTBLOCK || - irec->br_startblock == DELAYSTARTBLOCK) { + irec->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(irec->br_startblock)) { *shared = false; return 0; } @@ -227,50 +228,54 @@ xfs_reflink_trim_around_shared( } } -/* Create a CoW reservation for a range of blocks within a file. */ -static int -__xfs_reflink_reserve_cow( +/* + * Trim the passed in imap to the next shared/unshared extent boundary, and + * if imap->br_startoff points to a shared extent reserve space for it in the + * COW fork. In this case *shared is set to true, else to false. + * + * Note that imap will always contain the block numbers for the existing blocks + * in the data fork, as the upper layers need them for read-modify-write + * operations. + */ +int +xfs_reflink_reserve_cow( struct xfs_inode *ip, - xfs_fileoff_t *offset_fsb, - xfs_fileoff_t end_fsb, - bool *skipped) + struct xfs_bmbt_irec *imap, + bool *shared) { - struct xfs_bmbt_irec got, prev, imap; - xfs_fileoff_t orig_end_fsb; - int nimaps, eof = 0, error = 0; - bool shared = false, trimmed = false; + struct xfs_bmbt_irec got, prev; + xfs_fileoff_t end_fsb, orig_end_fsb; + int eof = 0, error = 0; + bool trimmed; xfs_extnum_t idx; xfs_extlen_t align; - /* Already reserved? Skip the refcount btree access. */ - xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, + /* + * Search the COW fork extent list first. This serves two purposes: + * first this implement the speculative preallocation using cowextisze, + * so that we also unshared block adjacent to shared blocks instead + * of just the shared blocks themselves. Second the lookup in the + * extent list is generally faster than going out to the shared extent + * tree. + */ + xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, &got, &prev); - if (!eof && got.br_startoff <= *offset_fsb) { - end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; - trace_xfs_reflink_cow_found(ip, &got); - goto done; - } + if (!eof && got.br_startoff <= imap->br_startoff) { + trace_xfs_reflink_cow_found(ip, imap); + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - /* Read extent from the source file. */ - nimaps = 1; - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, - &imap, &nimaps, 0); - if (error) - goto out_unlock; - ASSERT(nimaps == 1); + *shared = true; + return 0; + } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); if (error) - goto out_unlock; - - end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; + return error; /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) { - *skipped = true; - goto done; - } + if (!*shared) + return 0; /* * Fork all the shared blocks from our write offset until the end of @@ -278,72 +283,38 @@ __xfs_reflink_reserve_cow( */ error = xfs_qm_dqattach_locked(ip, 0); if (error) - goto out_unlock; + return error; + + end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); if (align) end_fsb = roundup_64(end_fsb, align); retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, - end_fsb - *offset_fsb, &got, - &prev, &idx, eof); + error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, + end_fsb - imap->br_startoff, &got, &prev, &idx, eof); switch (error) { case 0: break; case -ENOSPC: case -EDQUOT: /* retry without any preallocation */ - trace_xfs_reflink_cow_enospc(ip, &imap); + trace_xfs_reflink_cow_enospc(ip, imap); if (end_fsb != orig_end_fsb) { end_fsb = orig_end_fsb; goto retry; } /*FALLTHRU*/ default: - goto out_unlock; + return error; } if (end_fsb != orig_end_fsb) xfs_inode_set_cowblocks_tag(ip); trace_xfs_reflink_cow_alloc(ip, &got); -done: - *offset_fsb = end_fsb; -out_unlock: - return error; -} - -/* Create a CoW reservation for part of a file. */ -int -xfs_reflink_reserve_cow_range( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t count) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - bool skipped = false; - int error; - - trace_xfs_reflink_reserve_cow_range(ip, offset, count); - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - end_fsb = XFS_B_TO_FSB(mp, offset + count); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - while (offset_fsb < end_fsb) { - error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb, - &skipped); - if (error) { - trace_xfs_reflink_reserve_cow_range_error(ip, error, - _RET_IP_); - break; - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; + return 0; } /* Allocate all CoW reservations covering a range of blocks in a file. */ @@ -358,9 +329,8 @@ __xfs_reflink_allocate_cow( struct xfs_defer_ops dfops; struct xfs_trans *tp; xfs_fsblock_t first_block; - xfs_fileoff_t next_fsb; int nimaps = 1, error; - bool skipped = false; + bool shared; xfs_defer_init(&dfops, &first_block); @@ -371,33 +341,38 @@ __xfs_reflink_allocate_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); - next_fsb = *offset_fsb; - error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped); + /* Read extent from the source file. */ + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_unlock; + ASSERT(nimaps == 1); + + error = xfs_reflink_reserve_cow(ip, &imap, &shared); if (error) goto out_trans_cancel; - if (skipped) { - *offset_fsb = next_fsb; + if (!shared) { + *offset_fsb = imap.br_startoff + imap.br_blockcount; goto out_trans_cancel; } xfs_trans_ijoin(tp, ip, 0); - error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb, + error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, XFS_BMAPI_COWFORK, &first_block, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), &imap, &nimaps, &dfops); if (error) goto out_trans_cancel; - /* We might not have been able to map the whole delalloc extent */ - *offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb); - error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_trans_cancel; error = xfs_trans_commit(tp); + *offset_fsb = imap.br_startoff + imap.br_blockcount; out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -536,58 +511,49 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb) { - struct xfs_bmbt_irec irec; - xfs_filblks_t count_fsb; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; + xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error = 0; - int nimaps; + int error = 0, eof = 0; if (!xfs_is_reflink_inode(ip)) return 0; - /* Go find the old extent in the CoW fork. */ - while (offset_fsb < end_fsb) { - nimaps = 1; - count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); - error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, - &nimaps, XFS_BMAPI_COWFORK); - if (error) - break; - ASSERT(nimaps == 1); - - trace_xfs_reflink_cancel_cow(ip, &irec); + xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, + &got, &prev); + if (eof) + return 0; - if (irec.br_startblock == DELAYSTARTBLOCK) { - /* Free a delayed allocation. */ - xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount, - false); - ip->i_delayed_blks -= irec.br_blockcount; + while (got.br_startoff < end_fsb) { + del = got; + xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + trace_xfs_reflink_cancel_cow(ip, &del); - /* Remove the mapping from the CoW fork. */ - error = xfs_bunmapi_cow(ip, &irec); + if (isnullstartblock(del.br_startblock)) { + error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, + &idx, &got, &del); if (error) break; - } else if (irec.br_startblock == HOLESTARTBLOCK) { - /* empty */ } else { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); /* Free the CoW orphan record. */ error = xfs_refcount_free_cow_extent(ip->i_mount, - &dfops, irec.br_startblock, - irec.br_blockcount); + &dfops, del.br_startblock, + del.br_blockcount); if (error) break; xfs_bmap_add_free(ip->i_mount, &dfops, - irec.br_startblock, irec.br_blockcount, + del.br_startblock, del.br_blockcount, NULL); /* Update quota accounting */ xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, - -(long)irec.br_blockcount); + -(long)del.br_blockcount); /* Roll the transaction */ error = xfs_defer_finish(tpp, &dfops, ip); @@ -597,15 +563,18 @@ xfs_reflink_cancel_cow_blocks( } /* Remove the mapping from the CoW fork. */ - error = xfs_bunmapi_cow(ip, &irec); - if (error) - break; + xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - /* Roll on... */ - offset_fsb = irec.br_startoff + irec.br_blockcount; + if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + break; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } + /* clear tag if cow fork is emptied */ + if (!ifp->if_bytes) + xfs_inode_clear_cowblocks_tag(ip); + return error; } @@ -668,25 +637,26 @@ xfs_reflink_end_cow( xfs_off_t offset, xfs_off_t count) { - struct xfs_bmbt_irec irec; - struct xfs_bmbt_irec uirec; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; - xfs_filblks_t count_fsb; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error; + int error, eof = 0; unsigned int resblks; - xfs_filblks_t ilen; xfs_filblks_t rlen; - int nimaps; + xfs_extnum_t idx; trace_xfs_reflink_end_cow(ip, offset, count); + /* No COW extents? That's easy! */ + if (ifp->if_bytes == 0) + return 0; + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); - count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); /* Start a rolling transaction to switch the mappings */ resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); @@ -698,72 +668,65 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* Go find the old extent in the CoW fork. */ - while (offset_fsb < end_fsb) { - /* Read extent from the source file */ - nimaps = 1; - count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); - error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, - &nimaps, XFS_BMAPI_COWFORK); - if (error) - goto out_cancel; - ASSERT(nimaps == 1); + xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, + &got, &prev); - ASSERT(irec.br_startblock != DELAYSTARTBLOCK); - trace_xfs_reflink_cow_remap(ip, &irec); + /* If there is a hole at end_fsb - 1 go to the previous extent */ + if (eof || got.br_startoff > end_fsb) { + ASSERT(idx > 0); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + } - /* - * We can have a hole in the CoW fork if part of a directio - * write is CoW but part of it isn't. - */ - rlen = ilen = irec.br_blockcount; - if (irec.br_startblock == HOLESTARTBLOCK) + /* Walk backwards until we're out of the I/O range... */ + while (got.br_startoff + got.br_blockcount > offset_fsb) { + del = got; + xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + + /* Extent delete may have bumped idx forward */ + if (!del.br_blockcount) { + idx--; goto next_extent; + } + + ASSERT(!isnullstartblock(got.br_startblock)); /* Unmap the old blocks in the data fork. */ - while (rlen) { - xfs_defer_init(&dfops, &firstfsb); - error = __xfs_bunmapi(tp, ip, irec.br_startoff, - &rlen, 0, 1, &firstfsb, &dfops); - if (error) - goto out_defer; - - /* - * Trim the extent to whatever got unmapped. - * Remember, bunmapi works backwards. - */ - uirec.br_startblock = irec.br_startblock + rlen; - uirec.br_startoff = irec.br_startoff + rlen; - uirec.br_blockcount = irec.br_blockcount - rlen; - irec.br_blockcount = rlen; - trace_xfs_reflink_cow_remap_piece(ip, &uirec); + xfs_defer_init(&dfops, &firstfsb); + rlen = del.br_blockcount; + error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1, + &firstfsb, &dfops); + if (error) + goto out_defer; - /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(tp->t_mountp, - &dfops, uirec.br_startblock, - uirec.br_blockcount); - if (error) - goto out_defer; + /* Trim the extent to whatever got unmapped. */ + if (rlen) { + xfs_trim_extent(&del, del.br_startoff + rlen, + del.br_blockcount - rlen); + } + trace_xfs_reflink_cow_remap(ip, &del); - /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp->t_mountp, &dfops, - ip, &uirec); - if (error) - goto out_defer; + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops, + del.br_startblock, del.br_blockcount); + if (error) + goto out_defer; - /* Remove the mapping from the CoW fork. */ - error = xfs_bunmapi_cow(ip, &uirec); - if (error) - goto out_defer; + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del); + if (error) + goto out_defer; - error = xfs_defer_finish(&tp, &dfops, ip); - if (error) - goto out_defer; - } + /* Remove the mapping from the CoW fork. */ + xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; next_extent: - /* Roll on... */ - offset_fsb = irec.br_startoff + ilen; + if (idx < 0) + break; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } error = xfs_trans_commit(tp); @@ -774,7 +737,6 @@ next_extent: out_defer: xfs_defer_cancel(&dfops); -out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: @@ -1312,19 +1274,26 @@ out_error: */ int xfs_reflink_remap_range( - struct xfs_inode *src, - xfs_off_t srcoff, - struct xfs_inode *dest, - xfs_off_t destoff, - xfs_off_t len, - unsigned int flags) + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) { + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); struct xfs_mount *mp = src->i_mount; + loff_t bs = inode_out->i_sb->s_blocksize; + bool same_inode = (inode_in == inode_out); xfs_fileoff_t sfsbno, dfsbno; xfs_filblks_t fsblen; - int error; xfs_extlen_t cowextsize; - bool is_same; + loff_t isize; + ssize_t ret; + loff_t blen; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return -EOPNOTSUPP; @@ -1332,17 +1301,8 @@ xfs_reflink_remap_range( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - /* Don't reflink realtime inodes */ - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) - return -EINVAL; - - if (flags & ~XFS_REFLINK_ALL) - return -EINVAL; - - trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); - /* Lock both files against IO */ - if (src->i_ino == dest->i_ino) { + if (same_inode) { xfs_ilock(src, XFS_IOLOCK_EXCL); xfs_ilock(src, XFS_MMAPLOCK_EXCL); } else { @@ -1350,39 +1310,126 @@ xfs_reflink_remap_range( xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); } + /* Don't touch certain kinds of inodes */ + ret = -EPERM; + if (IS_IMMUTABLE(inode_out)) + goto out_unlock; + + ret = -ETXTBSY; + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + goto out_unlock; + + + /* Don't reflink dirs, pipes, sockets... */ + ret = -EISDIR; + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + goto out_unlock; + ret = -EINVAL; + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) + goto out_unlock; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + goto out_unlock; + + /* Don't reflink realtime inodes */ + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + goto out_unlock; + + /* Don't share DAX file data for now. */ + if (IS_DAX(inode_in) || IS_DAX(inode_out)) + goto out_unlock; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) { + ret = 0; + goto out_unlock; + } + + if (len == 0) + len = isize - pos_in; + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + len < pos_in || pos_out + len < pos_out || + pos_in + len > isize) + goto out_unlock; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + len > disize) + goto out_unlock; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + goto out_unlock; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + goto out_unlock; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + len - 1); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len - 1); + if (ret) + goto out_unlock; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + /* * Check that the extents are the same. */ - if (flags & XFS_REFLINK_DEDUPE) { - is_same = false; - error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest), - destoff, len, &is_same); - if (error) - goto out_error; + if (is_dedupe) { + bool is_same = false; + + ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, + len, &is_same); + if (ret) + goto out_unlock; if (!is_same) { - error = -EBADE; - goto out_error; + ret = -EBADE; + goto out_unlock; } } - error = xfs_reflink_set_inode_flag(src, dest); - if (error) - goto out_error; + ret = xfs_reflink_set_inode_flag(src, dest); + if (ret) + goto out_unlock; /* * Invalidate the page cache so that we can clear any CoW mappings * in the destination file. */ - truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff, - PAGE_ALIGN(destoff + len) - 1); + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); - dfsbno = XFS_B_TO_FSBT(mp, destoff); - sfsbno = XFS_B_TO_FSBT(mp, srcoff); + dfsbno = XFS_B_TO_FSBT(mp, pos_out); + sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); - error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, - destoff + len); - if (error) - goto out_error; + ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, + pos_out + len); + if (ret) + goto out_unlock; /* * Carry the cowextsize hint from src to dest if we're sharing the @@ -1390,26 +1437,24 @@ xfs_reflink_remap_range( * has a cowextsize hint, and the destination file does not. */ cowextsize = 0; - if (srcoff == 0 && len == i_size_read(VFS_I(src)) && + if (pos_in == 0 && len == i_size_read(inode_in) && (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && - destoff == 0 && len >= i_size_read(VFS_I(dest)) && + pos_out == 0 && len >= i_size_read(inode_out) && !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) cowextsize = src->i_d.di_cowextsize; - error = xfs_reflink_update_dest(dest, destoff + len, cowextsize); - if (error) - goto out_error; + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); -out_error: +out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); xfs_iunlock(src, XFS_IOLOCK_EXCL); if (src->i_ino != dest->i_ino) { xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); xfs_iunlock(dest, XFS_IOLOCK_EXCL); } - if (error) - trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); - return error; + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + return ret; } /* diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 5dc3c8ac12aa..fad11607c9ad 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -26,8 +26,8 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); -extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip, - xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, bool *shared); extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, @@ -43,11 +43,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); -#define XFS_REFLINK_DEDUPE 1 /* only reflink if contents match */ -#define XFS_REFLINK_ALL (XFS_REFLINK_DEDUPE) -extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff, - struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len, - unsigned int flags); +extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 5f8d55d29a11..276d3023d60f 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -512,13 +512,13 @@ static struct attribute *xfs_error_attrs[] = { }; -struct kobj_type xfs_error_cfg_ktype = { +static struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_error_attrs, }; -struct kobj_type xfs_error_ktype = { +static struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ad188d3a83f3..0907752be62d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3346,7 +3346,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range); +DEFINE_RW_EVENT(xfs_reflink_reserve_cow); DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); @@ -3356,9 +3356,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); |