// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2018 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"

/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
 * and sets XREP_ALREADY_FIXED in sc->flags if it thinks it repaired anything.
 */
int
xrep_attempt(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);

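	/* Release any AG btree cursors left over from the scrub phase. */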
	xchk_ag_btcur_free(&sc->sa);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	error = sc->ops->repair(sc);
	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded.  Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		sc->flags |= XREP_ALREADY_FIXED;
		return -EAGAIN;
	case -EDEADLOCK:
		/* Tell the caller to try again having grabbed all the locks. */
		if (!(sc->flags & XCHK_TRY_HARDER)) {
			sc->flags |= XCHK_TRY_HARDER;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it.  The corruption has not been fixed,
		 * so exit to userspace with the scan's output flags unchanged.
		 */
		return 0;
	default:
		/*
		 * EAGAIN tells the caller to re-scrub, so we cannot return
		 * that here.
		 */
		ASSERT(error != -EAGAIN);
		return error;
	}
}

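/*
 * A minimal sketch of how a driver is expected to consume the -EAGAIN
 * protocol above.  The real loop lives in xfs_scrub_metadata() in scrub.c;
 * the flow here is only illustrative:
 *
 *	retry_op:
 *		error = sc->ops->scrub(sc);
 *		if (!error && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) {
 *			error = xrep_attempt(sc);
 *			if (error == -EAGAIN)
 *				goto retry_op;
 *		}
 */
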
/*
 * Complain about unfixable problems in the filesystem.  We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xrep_failure(
	struct xfs_mount	*mp)
{
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
}

/*
 * Repair probe -- userspace uses this to probe if we're willing to repair a
 * given mountpoint.
 */
int
xrep_probe(
	struct xfs_scrub	*sc)
{
	int			error = 0;

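	/* No repair work to do here; just honor a pending kill signal. */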
	if (xchk_should_terminate(sc, &error))
		return error;

	return 0;
}

/*
 * Roll a transaction, keeping the AG headers locked and reinitializing
 * the btree cursors.
 */
int
xrep_roll_ag_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/*
	 * Keep the AG header buffers locked while we roll the transaction.
	 * Ensure that both AG buffers are dirty and held when we roll the
	 * transaction so that they move forward in the log without losing the
	 * bli (and hence the bli type) when the transaction commits.
	 *
	 * Normal code would never hold clean buffers across a roll, but repair
	 * needs both buffers to maintain a total lock on the AG.
	 */
	if (sc->sa.agi_bp) {
		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	}

	if (sc->sa.agf_bp) {
		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	}

	/*
	 * Roll the transaction.  We still hold the AG header buffers locked
	 * regardless of whether or not that succeeds.  On failure, the buffers
	 * will be released during teardown on our way out of the kernel.  If
	 * successful, join the buffers to the new transaction and move on.
	 */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		return error;

	/* Join the AG headers to the new transaction. */
	if (sc->sa.agi_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);

	return 0;
}

/*
 * Does the given AG have enough space to rebuild a btree?  Neither AG
 * reservation can be critical, and we must have enough space (factoring
 * in AG reservations) to construct a whole btree.
 */
bool
xrep_ag_has_space(
	struct xfs_perag	*pag,
	xfs_extlen_t		nr_blocks,
	enum xfs_ag_resv_type	type)
{
	return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
}

/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.
 */
xfs_extlen_t
xrep_calc_ag_resblks(
	struct xfs_scrub		*sc)
{
	struct xfs_mount		*mp = sc->mp;
	struct xfs_scrub_metadata	*sm = sc->sm;
	struct xfs_perag		*pag;
	struct xfs_buf			*bp;
	xfs_agino_t			icount = NULLAGINO;
	xfs_extlen_t			aglen = NULLAGBLOCK;
	xfs_extlen_t			usedlen;
	xfs_extlen_t			freelen;
	xfs_extlen_t			bnobt_sz;
	xfs_extlen_t			inobt_sz;
	xfs_extlen_t			rmapbt_sz;
	xfs_extlen_t			refcbt_sz;
	int				error;

	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	pag = xfs_perag_get(mp, sm->sm_agno);
	if (xfs_perag_initialised_agi(pag)) {
		/* Use in-core icount if possible. */
		icount = pag->pagi_count;
	} else {
		/* Try to get the actual counters from disk. */
		error = xfs_ialloc_read_agi(pag, NULL, &bp);
		if (!error) {
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
	if (error) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	} else {
		struct xfs_agf	*agf = bp->b_addr;

		aglen = be32_to_cpu(agf->agf_length);
		freelen = be32_to_cpu(agf->agf_freeblks);
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}

	/* If the icount is impossible, make some worst-case assumptions. */
	if (icount == NULLAGINO ||
	    !xfs_verify_agino(pag, icount)) {
		icount = pag->agino_max - pag->agino_min + 1;
	}

	/* If the block counts are impossible, make worst-case assumptions. */
	if (aglen == NULLAGBLOCK ||
	    aglen != pag->block_count ||
	    freelen >= aglen) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	}
	xfs_perag_put(pag);

	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree.  Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_has_sparseinodes(mp))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_has_finobt(mp))
		inobt_sz *= 2;
	if (xfs_has_reflink(mp))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_has_rmapbt(mp)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks.  However, with reflink it's possible to have
		 * more than one rmap record per AG block.  We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is a generous over-estimation.
		 */
		if (xfs_has_reflink(mp))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

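	/* Reserve enough blocks for whichever single btree is largest. */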
	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}

/* Allocate a block in an AG. */
int
xrep_alloc_ag_block(
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			*fsbno,
	enum xfs_ag_resv_type		resv)
{
	struct xfs_alloc_arg		args = {0};
	xfs_agblock_t			bno;
	int				error;

	switch (resv) {
	case XFS_AG_RESV_AGFL:
	case XFS_AG_RESV_RMAPBT:
		error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
				sc->sa.agf_bp, &bno, 1);
		if (error)
			return error;
		if (bno == NULLAGBLOCK)
			return -ENOSPC;
		xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
		if (resv == XFS_AG_RESV_RMAPBT)
			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
		return 0;
	default:
		break;
	}

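	/*
	 * Otherwise, allocate a single block from the AG's free space,
	 * accounted against the given reservation type.
	 */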
	args.tp = sc->tp;
	args.mp = sc->mp;
	args.pag = sc->sa.pag;
	args.oinfo = *oinfo;
	args.minlen = 1;
	args.maxlen = 1;
	args.prod = 1;
	args.resv = resv;

	error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno);
	if (error)
		return error;
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;
	ASSERT(args.len == 1);
	*fsbno = args.fsbno;

	return 0;
}

/* Initialize a new AG btree root block with zero entries. */
int
xrep_init_btblock(
	struct xfs_scrub		*sc,
	xfs_fsblock_t			fsb,
	struct xfs_buf			**bpp,
	xfs_btnum_t			btnum,
	const struct xfs_buf_ops	*ops)
{
	struct xfs_trans		*tp = sc->tp;
	struct xfs_mount		*mp = sc->mp;
	struct xfs_buf			*bp;
	int				error;

	trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
			XFS_FSB_TO_AGBNO(mp, fsb), btnum);

	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
	error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
			&bp);
	if (error)
		return error;
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
	bp->b_ops = ops;
	*bpp = bp;

	return 0;
}

/*
 * Reconstructing per-AG Btrees
 *
 * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 * we scan secondary space metadata to derive the records that should be in
 * the damaged btree, initialize a fresh btree root, and insert the records.
 * Note that for rebuilding the rmapbt we scan all the primary data to
 * generate the new records.
 *
 * However, that leaves the matter of removing all the metadata describing the
 * old broken structure.  For primary metadata we use the rmap data to collect
 * every extent with a matching rmap owner (bitmap); we then iterate all other
 * metadata structures with the same rmap owner to collect the extents that
 * cannot be removed (sublist).  We then subtract sublist from bitmap to
 * derive the blocks that were used by the old btree.  These blocks can be
 * reaped.
 *
 * For rmapbt reconstructions we must use different tactics for extent
 * collection.  First we iterate all primary metadata (this excludes the old
 * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 * records are collected as bitmap.  The bnobt records are collected as
 * sublist.  As with the other btrees we subtract sublist from bitmap, and the
 * result (since the rmapbt lives in the free space) is the set of blocks from
 * the old rmapbt.
 *
 * Disposal of Blocks from Old per-AG Btrees
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap.  In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk.  The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked.  Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space.  Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block.  If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be a rmap record and everything is ok.  For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal.  If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers for the entire rebuild
 * operation so that nothing else can sneak in and change the AG state while
 * we're not looking.  We also assume that the caller already invalidated any
 * buffers associated with @bitmap.
 */

/*
 * Invalidate buffers for per-AG btree blocks we're dumping.  This function
 * is not intended for use with file data repairs; we have bunmapi for that.
 */
int
xrep_invalidate_blocks(
	struct xfs_scrub	*sc,
	struct xbitmap		*bitmap)
{
	struct xbitmap_range	*bmr;
	struct xbitmap_range	*n;
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno;

	/*
	 * For each block in each extent, see if there's an incore buffer for
	 * exactly that block; if so, invalidate it.  The buffer cache only
	 * lets us look for one buffer at a time, so we have to look one block
	 * at a time.  Avoid invalidating AG headers and post-EOFS blocks
	 * because we never own those; and if we can't TRYLOCK the buffer we
	 * assume it's owned by someone else.
	 */
	for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
		int		error;

		/* Skip AG headers and post-EOFS blocks */
		if (!xfs_verify_fsbno(sc->mp, fsbno))
			continue;
		error = xfs_buf_incore(sc->mp->m_ddev_targp,
				XFS_FSB_TO_DADDR(sc->mp, fsbno),
				XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
		if (error)
			continue;

		xfs_trans_bjoin(sc->tp, bp);
		xfs_trans_binval(sc->tp, bp);
	}

	return 0;
}

/* Ensure the freelist is the correct size. */
int
xrep_fix_freelist(
	struct xfs_scrub	*sc,
	bool			can_shrink)
{
	struct xfs_alloc_arg	args = {0};

	args.mp = sc->mp;
	args.tp = sc->tp;
	args.agno = sc->sa.pag->pag_agno;
	args.alignment = 1;
	args.pag = sc->sa.pag;

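	/* NOSHRINK means the freelist may be extended but never trimmed. */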
	return xfs_alloc_fix_freelist(&args,
			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
}

/*
 * Put a block back on the AGFL.
 */
STATIC int
xrep_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	struct xfs_buf		*agfl_bp;
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, true);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			agfl_bp, agbno, 0);
	if (error)
		return error;
	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/* Dispose of a single block. */
STATIC int
xrep_reap_block(
	struct xfs_scrub		*sc,
	xfs_fsblock_t			fsbno,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		resv)
{
	struct xfs_btree_cur		*cur;
	struct xfs_buf			*agf_bp = NULL;
	xfs_agblock_t			agbno;
	bool				has_other_rmap;
	int				error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);

	/*
	 * If we are repairing per-inode metadata, we need to read in the AGF
	 * buffer.  Otherwise, we're repairing a per-AG structure, so reuse
	 * the AGF buffer that the setup functions already grabbed.
	 */
	if (sc->ip) {
		error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
		if (error)
			return error;
	} else {
		agf_bp = sc->sa.agf_bp;
	}
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);

	/* Can we find any other rmappings? */
	error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
	xfs_btree_del_cursor(cur, error);
	if (error)
		goto out_free;

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (has_other_rmap)
		error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
					1, oinfo);
	else if (resv == XFS_AG_RESV_AGFL)
		error = xrep_put_freelist(sc, agbno);
	else
		error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
	if (agf_bp != sc->sa.agf_bp)
		xfs_trans_brelse(sc->tp, agf_bp);
	if (error)
		return error;

	if (sc->ip)
		return xfs_trans_roll_inode(&sc->tp, sc->ip);
	return xrep_roll_ag_trans(sc);

out_free:
	if (agf_bp != sc->sa.agf_bp)
		xfs_trans_brelse(sc->tp, agf_bp);
	return error;
}

/* Dispose of every block of every extent in the bitmap. */
int
xrep_reap_extents(
	struct xfs_scrub		*sc,
	struct xbitmap			*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xbitmap_range		*bmr;
	struct xbitmap_range		*n;
	xfs_fsblock_t			fsbno;
	int				error = 0;

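	/* Reaping decides free vs. unmap via rmap lookups, so we need rmapbt. */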
	ASSERT(xfs_has_rmapbt(sc->mp));

	for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
		ASSERT(sc->ip != NULL ||
		       XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
		trace_xrep_dispose_btree_extent(sc->mp,
				XFS_FSB_TO_AGNO(sc->mp, fsbno),
				XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);

		error = xrep_reap_block(sc, fsbno, oinfo, type);
		if (error)
			break;
	}

	return error;
}

/*
 * Finding per-AG Btree Roots for AGF/AGI Reconstruction
 *
 * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
 * the AG headers by using the rmap data to rummage through the AG looking for
 * btree roots.  This is not guaranteed to work if the AG is heavily damaged
 * or the rmap data are corrupt.
 *
 * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
 * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
 * AGI is being rebuilt.  It must maintain these locks until it's safe for
 * other threads to change the btrees' shapes.  The caller provides
 * information about the btrees to look for by passing in an array of
 * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
 * The (root, height) fields will be set on return if anything is found.  The
 * last element of the array should have a NULL buf_ops to mark the end of the
 * array.
 *
 * For every rmapbt record matching any of the rmap owners in btree_info,
 * read each block referenced by the rmap record.  If the block is a btree
 * block from this filesystem matching any of the magic numbers and has a
 * level higher than what we've already seen, remember the block and the
 * height of the tree required to have such a block.  When the call completes,
 * we return the highest block we've found for each btree description; those
 * should be the roots.
 */

struct xrep_findroot {
	struct xfs_scrub		*sc;
	struct xfs_buf			*agfl_bp;
	struct xfs_agf			*agf;
	struct xrep_find_ag_btree	*btree_info;
};

/* See if our block is in the AGFL. */
STATIC int
xrep_findroot_agfl_walk(
	struct xfs_mount	*mp,
	xfs_agblock_t		bno,
	void			*priv)
{
	xfs_agblock_t		*agbno = priv;

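	/* -ECANCELED stops the AGFL walk and tells the caller we found it. */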
	return (*agbno == bno) ? -ECANCELED : 0;
}

/* Does this block match the btree information passed in? */
STATIC int
xrep_findroot_block(
	struct xrep_findroot		*ri,
	struct xrep_find_ag_btree	*fab,
	uint64_t			owner,
	xfs_agblock_t			agbno,
	bool				*done_with_block)
{
	struct xfs_mount		*mp = ri->sc->mp;
	struct xfs_buf			*bp;
	struct xfs_btree_block		*btblock;
	xfs_daddr_t			daddr;
	int				block_level;
	int				error = 0;

	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);

	/*
	 * Blocks in the AGFL have stale contents that might just happen to
	 * have a matching magic and uuid.  We don't want to pull these blocks
	 * in as part of a tree root, so we have to filter out the AGFL stuff
	 * here.  If the AGFL looks insane we'll just refuse to repair.
	 */
	if (owner == XFS_RMAP_OWN_AG) {
		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
				xrep_findroot_agfl_walk, &agbno);
		if (error == -ECANCELED)
			return 0;
		if (error)
			return error;
	}

	/*
	 * Read the buffer into memory so that we can see if it's a match for
	 * our btree type.  We have no clue if it is beforehand, and we want to
	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
	 * will cause needless disk reads in subsequent calls to this function)
	 * and logging metadata verifier failures.
	 *
	 * Therefore, pass in NULL buffer ops.  If the buffer was already in
	 * memory from some other caller it will already have b_ops assigned.
	 * If it was in memory from a previous unsuccessful findroot_block
	 * call, the buffer won't have b_ops but it should be clean and ready
	 * for us to try to verify if the read call succeeds.  The same applies
	 * if the buffer wasn't in memory at all.
	 *
	 * Note: If we never match a btree type with this buffer, it will be
	 * left in memory with NULL b_ops.  This shouldn't be a problem unless
	 * the buffer gets written.
	 */
	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
			mp->m_bsize, 0, &bp, NULL);
	if (error)
		return error;

	/* Ensure the block magic matches the btree type we're looking for. */
	btblock = XFS_BUF_TO_BLOCK(bp);
	ASSERT(fab->buf_ops->magic[1] != 0);
	if (btblock->bb_magic != fab->buf_ops->magic[1])
		goto out;

	/*
	 * If the buffer already has ops applied and they're not the ones for
	 * this btree type, we know this block doesn't match the btree and we
	 * can bail out.
	 *
	 * If the buffer ops match ours, someone else has already validated
	 * the block for us, so we can move on to checking if this is a root
	 * block candidate.
	 *
	 * If the buffer does not have ops, nobody has successfully validated
	 * the contents and the buffer cannot be dirty.  If the magic, uuid,
	 * and structure match this btree type then we'll move on to checking
	 * if it's a root block candidate.  If there is no match, bail out.
	 */
	if (bp->b_ops) {
		if (bp->b_ops != fab->buf_ops)
			goto out;
	} else {
		ASSERT(!xfs_trans_buf_is_dirty(bp));
		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
				&mp->m_sb.sb_meta_uuid))
			goto out;
		/*
		 * Read verifiers can reference b_ops, so we set the pointer
		 * here.  If the verifier fails we'll reset the buffer state
		 * to what it was before we touched the buffer.
		 */
		bp->b_ops = fab->buf_ops;
		fab->buf_ops->verify_read(bp);
		if (bp->b_error) {
			bp->b_ops = NULL;
			bp->b_error = 0;
			goto out;
		}

		/*
		 * Some read verifiers will (re)set b_ops, so we must be
		 * careful not to change b_ops after running the verifier.
		 */
	}

	/*
	 * This block passes the magic/uuid and verifier tests for this btree
	 * type.  We don't need the caller to try the other tree types.
	 */
	*done_with_block = true;

	/*
	 * Compare this btree block's level to the height of the current
	 * candidate root block.
	 *
	 * If the level matches the root we found previously, throw away both
	 * blocks because there can't be two candidate roots.
	 *
	 * If level is lower in the tree than the root we found previously,
	 * ignore this block.
	 */
	block_level = xfs_btree_get_level(btblock);
	if (block_level + 1 == fab->height) {
		fab->root = NULLAGBLOCK;
		goto out;
	} else if (block_level < fab->height) {
		goto out;
	}

	/*
	 * This is the highest block in the tree that we've found so far.
	 * Update the btree height to reflect what we've learned from this
	 * block.
	 */
	fab->height = block_level + 1;

	/*
	 * If this block doesn't have sibling pointers, then it's the new root
	 * block candidate.  Otherwise, the root will be found farther up the
	 * tree.
	 */
	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
		fab->root = agbno;
	else
		fab->root = NULLAGBLOCK;

	trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
			be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
	xfs_trans_brelse(ri->sc->tp, bp);
	return error;
}

/*
 * Do any of the blocks in this rmap record match one of the btrees we're
 * looking for?
 */
STATIC int
xrep_findroot_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_findroot		*ri = priv;
	struct xrep_find_ag_btree	*fab;
	xfs_agblock_t			b;
	bool				done;
	int				error = 0;

	/* Ignore anything that isn't AG metadata. */
	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/* Otherwise scan each block + btree type. */
	for (b = 0; b < rec->rm_blockcount; b++) {
		done = false;
		for (fab = ri->btree_info; fab->buf_ops; fab++) {
			if (rec->rm_owner != fab->rmap_owner)
				continue;
			error = xrep_findroot_block(ri, fab,
					rec->rm_owner, rec->rm_startblock + b,
					&done);
			if (error)
				return error;
			if (done)
				break;
		}
	}

	return 0;
}

/* Find the roots of the per-AG btrees described in btree_info. */
int
xrep_find_ag_btree_roots(
	struct xfs_scrub		*sc,
	struct xfs_buf			*agf_bp,
	struct xrep_find_ag_btree	*btree_info,
	struct xfs_buf			*agfl_bp)
{
	struct xfs_mount		*mp = sc->mp;
	struct xrep_findroot		ri;
	struct xrep_find_ag_btree	*fab;
	struct xfs_btree_cur		*cur;
	int				error;

	ASSERT(xfs_buf_islocked(agf_bp));
	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));

	ri.sc = sc;
	ri.btree_info = btree_info;
	ri.agf = agf_bp->b_addr;
	ri.agfl_bp = agfl_bp;
	for (fab = btree_info; fab->buf_ops; fab++) {
		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
		fab->root = NULLAGBLOCK;
		fab->height = 0;
	}

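	/* Visit every rmap record in the AG, testing each mapped block. */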
	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
	error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
	xfs_btree_del_cursor(cur, error);

	return error;
}

/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
	struct xfs_scrub	*sc,
	xfs_dqtype_t		type)
{
	uint			flag;

	flag = xfs_quota_chkd_flag(type);
	if (!(flag & sc->mp->m_qflags))
		return;

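	/*
	 * Clear the CHKD flag from both the incore and ondisk superblock
	 * quota flags so that the next mount re-runs quotacheck.
	 */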
	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
	sc->mp->m_qflags &= ~flag;
	spin_lock(&sc->mp->m_sb_lock);
	sc->mp->m_sb.sb_qflags &= ~flag;
	spin_unlock(&sc->mp->m_sb_lock);
	xfs_log_sb(sc->tp);
	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}

/*
 * Attach dquots to this inode, or schedule quotacheck to fix them.
 *
 * This function ensures that the appropriate dquots are attached to an inode.
 * We cannot allow the dquot code to allocate an on-disk dquot block here
 * because we're already in transaction context with the inode locked.  The
 * on-disk dquot should already exist anyway.  If the quota code signals
 * corruption or missing quota information, schedule quotacheck, which will
 * repair corruptions in the quota metadata.
 */
int
xrep_ino_dqattach(
	struct xfs_scrub	*sc)
{
	int			error;

	error = xfs_qm_dqattach_locked(sc->ip, false);
	switch (error) {
	case -EFSBADCRC:
	case -EFSCORRUPTED:
	case -ENOENT:
		xfs_err_ratelimited(sc->mp,
"inode %llu repair encountered quota error %d, quotacheck forced.",
				(unsigned long long)sc->ip->i_ino, error);
		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
		fallthrough;
	case -ESRCH:
		error = 0;
		break;
	default:
		break;
	}

	return error;
}