}
/**
- * drop_incomplete_group - drop nodes from an incomplete group.
+ * drop_last_node - drop the last node or group of nodes.
* @sleb: scanned LEB information
* @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if whole group of nodes have to be dropped
*
- * This function returns %1 if nodes are dropped and %0 otherwise.
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * This function returns %1 if a node was dropped and %0 otherwise.
*/
-static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
{
int dropped = 0;
kfree(snod);
sleb->nodes_cnt -= 1;
dropped = 1;
+ if (!grouped)
+ break;
}
return dropped;
}
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf, int grouped)
{
- int ret = 0, err, len = c->leb_size - offs;
- int start = offs;
+ int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
struct ubifs_scan_leb *sleb;
void *buf = sbuf + offs;
if (IS_ERR(sleb))
return sleb;
+ ubifs_assert(len >= 8);
while (len >= 8) {
dbg_scan("look at LEB %d:%d (%d bytes left)",
lnum, offs, len);
}
}
- /* Drop nodes from incomplete group */
- if (grouped && drop_incomplete_group(sleb, &offs)) {
- buf = sbuf + offs;
- len = c->leb_size - offs;
- }
+ min_io_unit = round_down(offs, c->min_io_size);
+ if (grouped)
+ /*
+ * If nodes are grouped, always drop the incomplete group at
+ * the end.
+ */
+ drop_last_node(sleb, &offs, 1);
+
+ /*
+ * While we are in the middle of the same min. I/O unit keep dropping
+ * nodes. So basically, what we want is to make sure that the last min.
+ * I/O unit where we saw the corruption is dropped completely with all
+ * the uncorrupted node which may possibly sit there.
+ *
+ * In other words, let's name the min. I/O unit where the corruption
+ * starts B, and the previous min. I/O unit A. The below code tries to
+ * deal with a situation when half of B contains valid nodes or the end
+ * of a valid node, and the second half of B contains corrupted data or
+ * garbage. This means that UBIFS had been writing to B just before the
+ * power cut happened. I do not know how realistic is this scenario
+ * that half of the min. I/O unit had been written successfully and the
+ * other half not, but this is possible in our 'failure mode emulation'
+ * infrastructure at least.
+ *
+ * So what is the problem, why we need to drop those nodes? Whey can't
+ * we just clean-up the second half of B by putting a padding node
+ * there? We can, and this works fine with one exception which was
+ * reproduced with power cut emulation testing and happens extremely
+ * rarely. The description follows, but it is worth noting that that is
+ * only about the GC head, so we could do this trick only if the bud
+ * belongs to the GC head, but it does not seem to be worth an
+ * additional "if" statement.
+ *
+ * So, imagine the file-system is full, we run GC which is moving valid
+ * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+ * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+ * and will try to continue. Imagine that LEB X is currently the
+ * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+ * same as amount of free space in LEB X.
+ *
+ * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+ * are here trying to recover LEB Y which is the GC head LEB. We find
+ * the min. I/O unit B as described above. Then we clean-up LEB Y by
+ * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+ * fails, because it cannot find a dirty LEB which could be GC'd into
+ * LEB Y! Even LEB X does not match because the amount of valid nodes
+ * there does not fit the free space in LEB Y any more! And this is
+ * because of the padding node which we added to LEB Y. The
+ * user-visible effect of this which I once observed and analysed is
+ * that we cannot mount the file-system with -ENOSPC error.
+ *
+ * So obviously, to make sure that situation does not happen we should
+ * free min. I/O unit B in LEB Y completely and the last used min. I/O
+ * unit in LEB Y should be A. This is basically what the below code
+ * tries to do.
+ */
+ while (min_io_unit == round_down(offs, c->min_io_size) &&
+ min_io_unit != offs &&
+ drop_last_node(sleb, &offs, grouped));
+
+ buf = sbuf + offs;
+ len = c->leb_size - offs;
clean_buf(c, &buf, lnum, &offs, &len);
ubifs_end_scan(c, sleb, lnum, offs);