fs/xfs/xfs_inode.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include <linux/log2.h>
  19
  20 #include "xfs.h"
  21 #include "xfs_fs.h"
  22 #include "xfs_types.h"
  23 #include "xfs_bit.h"
  24 #include "xfs_log.h"
  25 #include "xfs_inum.h"
  26 #include "xfs_trans.h"
  27 #include "xfs_trans_priv.h"
  28 #include "xfs_sb.h"
  29 #include "xfs_ag.h"
  30 #include "xfs_mount.h"
  31 #include "xfs_bmap_btree.h"
  32 #include "xfs_alloc_btree.h"
  33 #include "xfs_ialloc_btree.h"
  34 #include "xfs_attr_sf.h"
  35 #include "xfs_dinode.h"
  36 #include "xfs_inode.h"
  37 #include "xfs_buf_item.h"
  38 #include "xfs_inode_item.h"
  39 #include "xfs_btree.h"
  40 #include "xfs_alloc.h"
  41 #include "xfs_ialloc.h"
  42 #include "xfs_bmap.h"
  43 #include "xfs_error.h"
  44 #include "xfs_utils.h"
  45 #include "xfs_quota.h"
  46 #include "xfs_filestream.h"
  47 #include "xfs_vnodeops.h"
  48 #include "xfs_trace.h"
  49
  50 kmem_zone_t *xfs_ifork_zone;
  51 kmem_zone_t *xfs_inode_zone;
  52
  53 /*
  54  * Used in xfs_itruncate_extents().  This is the maximum number of extents
  55  * freed from a file in a single transaction.
  56  */
  57 #define XFS_ITRUNC_MAX_EXTENTS  2
  58
  59 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
  60 STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
  61 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
  62 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
  63
  64 #ifdef DEBUG
  65 /*
  66  * Make sure that the extents in the given memory buffer
  67  * are valid.
  68  */
  69 STATIC void
  70 xfs_validate_extents(
  71         xfs_ifork_t             *ifp,
  72         int                     nrecs,
  73         xfs_exntfmt_t           fmt)
  74 {
  75         xfs_bmbt_irec_t         irec;
  76         xfs_bmbt_rec_host_t     rec;
  77         int                     i;
  78
  79         for (i = 0; i < nrecs; i++) {
  80                 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
  81                 rec.l0 = get_unaligned(&ep->l0);
  82                 rec.l1 = get_unaligned(&ep->l1);
  83                 xfs_bmbt_get_all(&rec, &irec);
  84                 if (fmt == XFS_EXTFMT_NOSTATE)
  85                         ASSERT(irec.br_state == XFS_EXT_NORM);
  86         }
  87 }
  88 #else /* DEBUG */
  89 #define xfs_validate_extents(ifp, nrecs, fmt)
  90 #endif /* DEBUG */
  91
  92 /*
  93  * Check that none of the inode's in the buffer have a next
  94  * unlinked field of 0.
  95  */
  96 #if defined(DEBUG)
  97 void
  98 xfs_inobp_check(
  99         xfs_mount_t     *mp,
 100         xfs_buf_t       *bp)
 101 {
 102         int             i;
 103         int             j;
 104         xfs_dinode_t    *dip;
 105
 106         j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
 107
 108         for (i = 0; i < j; i++) {
 109                 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 110                                         i * mp->m_sb.sb_inodesize);
 111                 if (!dip->di_next_unlinked)  {
 112                         xfs_alert(mp,
 113         "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
 114                                 bp);
 115                         ASSERT(dip->di_next_unlinked);
 116                 }
 117         }
 118 }
 119 #endif
 120
 121 /*
 122  * Find the buffer associated with the given inode map
 123  * We do basic validation checks on the buffer once it has been
 124  * retrieved from disk.
 125  */
 126 STATIC int
 127 xfs_imap_to_bp(
 128         xfs_mount_t     *mp,
 129         xfs_trans_t     *tp,
 130         struct xfs_imap *imap,
 131         xfs_buf_t       **bpp,
 132         uint            buf_flags,
 133         uint            iget_flags)
 134 {
 135         int             error;
 136         int             i;
 137         int             ni;
 138         xfs_buf_t       *bp;
 139
 140         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 141                                    (int)imap->im_len, buf_flags, &bp);
 142         if (error) {
 143                 if (error != EAGAIN) {
 144                         xfs_warn(mp,
 145                                 "%s: xfs_trans_read_buf() returned error %d.",
 146                                 __func__, error);
 147                 } else {
 148                         ASSERT(buf_flags & XBF_TRYLOCK);
 149                 }
 150                 return error;
 151         }
 152
 153         /*
 154          * Validate the magic number and version of every inode in the buffer
 155          * (if DEBUG kernel) or the first inode in the buffer, otherwise.
 156          */
 157 #ifdef DEBUG
 158         ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
 159 #else   /* usual case */
 160         ni = 1;
 161 #endif
 162
 163         for (i = 0; i < ni; i++) {
 164                 int             di_ok;
 165                 xfs_dinode_t    *dip;
 166
 167                 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 168                                         (i << mp->m_sb.sb_inodelog));
 169                 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
 170                             XFS_DINODE_GOOD_VERSION(dip->di_version);
 171                 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 172                                                 XFS_ERRTAG_ITOBP_INOTOBP,
 173                                                 XFS_RANDOM_ITOBP_INOTOBP))) {
 174                         if (iget_flags & XFS_IGET_UNTRUSTED) {
 175                                 xfs_trans_brelse(tp, bp);
 176                                 return XFS_ERROR(EINVAL);
 177                         }
 178                         XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
 179                                                 XFS_ERRLEVEL_HIGH, mp, dip);
 180 #ifdef DEBUG
 181                         xfs_emerg(mp,
 182                                 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
 183                                 (unsigned long long)imap->im_blkno, i,
 184                                 be16_to_cpu(dip->di_magic));
 185                         ASSERT(0);
 186 #endif
 187                         xfs_trans_brelse(tp, bp);
 188                         return XFS_ERROR(EFSCORRUPTED);
 189                 }
 190         }
 191
 192         xfs_inobp_check(mp, bp);
 193         *bpp = bp;
 194         return 0;
 195 }
 196
 197 /*
 198  * This routine is called to map an inode number within a file
 199  * system to the buffer containing the on-disk version of the
 200  * inode.  It returns a pointer to the buffer containing the
 201  * on-disk inode in the bpp parameter, and in the dip parameter
 202  * it returns a pointer to the on-disk inode within that buffer.
 203  *
 204  * If a non-zero error is returned, then the contents of bpp and
 205  * dipp are undefined.
 206  *
 207  * Use xfs_imap() to determine the size and location of the
 208  * buffer to read from disk.
 209  */
 210 int
 211 xfs_inotobp(
 212         xfs_mount_t     *mp,
 213         xfs_trans_t     *tp,
 214         xfs_ino_t       ino,
 215         xfs_dinode_t    **dipp,
 216         xfs_buf_t       **bpp,
 217         int             *offset,
 218         uint            imap_flags)
 219 {
 220         struct xfs_imap imap;
 221         xfs_buf_t       *bp;
 222         int             error;
 223
 224         imap.im_blkno = 0;
 225         error = xfs_imap(mp, tp, ino, &imap, imap_flags);
 226         if (error)
 227                 return error;
 228
 229         error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags);
 230         if (error)
 231                 return error;
 232
 233         *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
 234         *bpp = bp;
 235         *offset = imap.im_boffset;
 236         return 0;
 237 }
 238
 239
 240 /*
 241  * This routine is called to map an inode to the buffer containing
 242  * the on-disk version of the inode.  It returns a pointer to the
 243  * buffer containing the on-disk inode in the bpp parameter, and in
 244  * the dip parameter it returns a pointer to the on-disk inode within
 245  * that buffer.
 246  *
 247  * If a non-zero error is returned, then the contents of bpp and
 248  * dipp are undefined.
 249  *
 250  * The inode is expected to already been mapped to its buffer and read
 251  * in once, thus we can use the mapping information stored in the inode
 252  * rather than calling xfs_imap().  This allows us to avoid the overhead
 253  * of looking at the inode btree for small block file systems
 254  * (see xfs_imap()).
 255  */
 256 int
 257 xfs_itobp(
 258         xfs_mount_t     *mp,
 259         xfs_trans_t     *tp,
 260         xfs_inode_t     *ip,
 261         xfs_dinode_t    **dipp,
 262         xfs_buf_t       **bpp,
 263         uint            buf_flags)
 264 {
 265         xfs_buf_t       *bp;
 266         int             error;
 267
 268         ASSERT(ip->i_imap.im_blkno != 0);
 269
 270         error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
 271         if (error)
 272                 return error;
 273
 274         if (!bp) {
 275                 ASSERT(buf_flags & XBF_TRYLOCK);
 276                 ASSERT(tp == NULL);
 277                 *bpp = NULL;
 278                 return EAGAIN;
 279         }
 280
 281         *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 282         *bpp = bp;
 283         return 0;
 284 }
 285
 286 /*
 287  * Move inode type and inode format specific information from the
 288  * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 289  * this means set if_rdev to the proper value.  For files, directories,
 290  * and symlinks this means to bring in the in-line data or extent
 291  * pointers.  For a file in B-tree format, only the root is immediately
 292  * brought in-core.  The rest will be in-lined in if_extents when it
 293  * is first referenced (see xfs_iread_extents()).
 294  */
 295 STATIC int
 296 xfs_iformat(
 297         xfs_inode_t             *ip,
 298         xfs_dinode_t            *dip)
 299 {
 300         xfs_attr_shortform_t    *atp;
 301         int                     size;
 302         int                     error = 0;
 303         xfs_fsize_t             di_size;
 304
 305         if (unlikely(be32_to_cpu(dip->di_nextents) +
 306                      be16_to_cpu(dip->di_anextents) >
 307                      be64_to_cpu(dip->di_nblocks))) {
 308                 xfs_warn(ip->i_mount,
 309                         "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
 310                         (unsigned long long)ip->i_ino,
 311                         (int)(be32_to_cpu(dip->di_nextents) +
 312                               be16_to_cpu(dip->di_anextents)),
 313                         (unsigned long long)
 314                                 be64_to_cpu(dip->di_nblocks));
 315                 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 316                                      ip->i_mount, dip);
 317                 return XFS_ERROR(EFSCORRUPTED);
 318         }
 319
 320         if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
 321                 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
 322                         (unsigned long long)ip->i_ino,
 323                         dip->di_forkoff);
 324                 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 325                                      ip->i_mount, dip);
 326                 return XFS_ERROR(EFSCORRUPTED);
 327         }
 328
 329         if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
 330                      !ip->i_mount->m_rtdev_targp)) {
 331                 xfs_warn(ip->i_mount,
 332                         "corrupt dinode %Lu, has realtime flag set.",
 333                         ip->i_ino);
 334                 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
 335                                      XFS_ERRLEVEL_LOW, ip->i_mount, dip);
 336                 return XFS_ERROR(EFSCORRUPTED);
 337         }
 338
 339         switch (ip->i_d.di_mode & S_IFMT) {
 340         case S_IFIFO:
 341         case S_IFCHR:
 342         case S_IFBLK:
 343         case S_IFSOCK:
 344                 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
 345                         XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 346                                               ip->i_mount, dip);
 347                         return XFS_ERROR(EFSCORRUPTED);
 348                 }
 349                 ip->i_d.di_size = 0;
 350                 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
 351                 break;
 352
 353         case S_IFREG:
 354         case S_IFLNK:
 355         case S_IFDIR:
 356                 switch (dip->di_format) {
 357                 case XFS_DINODE_FMT_LOCAL:
 358                         /*
 359                          * no local regular files yet
 360                          */
 361                         if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
 362                                 xfs_warn(ip->i_mount,
 363                         "corrupt inode %Lu (local format for regular file).",
 364                                         (unsigned long long) ip->i_ino);
 365                                 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 366                                                      XFS_ERRLEVEL_LOW,
 367                                                      ip->i_mount, dip);
 368                                 return XFS_ERROR(EFSCORRUPTED);
 369                         }
 370
 371                         di_size = be64_to_cpu(dip->di_size);
 372                         if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
 373                                 xfs_warn(ip->i_mount,
 374                         "corrupt inode %Lu (bad size %Ld for local inode).",
 375                                         (unsigned long long) ip->i_ino,
 376                                         (long long) di_size);
 377                                 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
 378                                                      XFS_ERRLEVEL_LOW,
 379                                                      ip->i_mount, dip);
 380                                 return XFS_ERROR(EFSCORRUPTED);
 381                         }
 382
 383                         size = (int)di_size;
 384                         error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
 385                         break;
 386                 case XFS_DINODE_FMT_EXTENTS:
 387                         error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
 388                         break;
 389                 case XFS_DINODE_FMT_BTREE:
 390                         error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
 391                         break;
 392                 default:
 393                         XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
 394                                          ip->i_mount);
 395                         return XFS_ERROR(EFSCORRUPTED);
 396                 }
 397                 break;
 398
 399         default:
 400                 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
 401                 return XFS_ERROR(EFSCORRUPTED);
 402         }
 403         if (error) {
 404                 return error;
 405         }
 406         if (!XFS_DFORK_Q(dip))
 407                 return 0;
 408
 409         ASSERT(ip->i_afp == NULL);
 410         ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
 411
 412         switch (dip->di_aformat) {
 413         case XFS_DINODE_FMT_LOCAL:
 414                 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 415                 size = be16_to_cpu(atp->hdr.totsize);
 416
 417                 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
 418                         xfs_warn(ip->i_mount,
 419                                 "corrupt inode %Lu (bad attr fork size %Ld).",
 420                                 (unsigned long long) ip->i_ino,
 421                                 (long long) size);
 422                         XFS_CORRUPTION_ERROR("xfs_iformat(8)",
 423                                              XFS_ERRLEVEL_LOW,
 424                                              ip->i_mount, dip);
 425                         return XFS_ERROR(EFSCORRUPTED);
 426                 }
 427
 428                 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
 429                 break;
 430         case XFS_DINODE_FMT_EXTENTS:
 431                 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
 432                 break;
 433         case XFS_DINODE_FMT_BTREE:
 434                 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
 435                 break;
 436         default:
 437                 error = XFS_ERROR(EFSCORRUPTED);
 438                 break;
 439         }
 440         if (error) {
 441                 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 442                 ip->i_afp = NULL;
 443                 xfs_idestroy_fork(ip, XFS_DATA_FORK);
 444         }
 445         return error;
 446 }
 447
 448 /*
 449  * The file is in-lined in the on-disk inode.
 450  * If it fits into if_inline_data, then copy
 451  * it there, otherwise allocate a buffer for it
 452  * and copy the data there.  Either way, set
 453  * if_data to point at the data.
 454  * If we allocate a buffer for the data, make
 455  * sure that its size is a multiple of 4 and
 456  * record the real size in i_real_bytes.
 457  */
 458 STATIC int
 459 xfs_iformat_local(
 460         xfs_inode_t     *ip,
 461         xfs_dinode_t    *dip,
 462         int             whichfork,
 463         int             size)
 464 {
 465         xfs_ifork_t     *ifp;
 466         int             real_size;
 467
 468         /*
 469          * If the size is unreasonable, then something
 470          * is wrong and we just bail out rather than crash in
 471          * kmem_alloc() or memcpy() below.
 472          */
 473         if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 474                 xfs_warn(ip->i_mount,
 475         "corrupt inode %Lu (bad size %d for local fork, size = %d).",
 476                         (unsigned long long) ip->i_ino, size,
 477                         XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
 478                 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
 479                                      ip->i_mount, dip);
 480                 return XFS_ERROR(EFSCORRUPTED);
 481         }
 482         ifp = XFS_IFORK_PTR(ip, whichfork);
 483         real_size = 0;
 484         if (size == 0)
 485                 ifp->if_u1.if_data = NULL;
 486         else if (size <= sizeof(ifp->if_u2.if_inline_data))
 487                 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 488         else {
 489                 real_size = roundup(size, 4);
 490                 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
 491         }
 492         ifp->if_bytes = size;
 493         ifp->if_real_bytes = real_size;
 494         if (size)
 495                 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
 496         ifp->if_flags &= ~XFS_IFEXTENTS;
 497         ifp->if_flags |= XFS_IFINLINE;
 498         return 0;
 499 }
 500
 501 /*
 502  * The file consists of a set of extents all
 503  * of which fit into the on-disk inode.
 504  * If there are few enough extents to fit into
 505  * the if_inline_ext, then copy them there.
 506  * Otherwise allocate a buffer for them and copy
 507  * them into it.  Either way, set if_extents
 508  * to point at the extents.
 509  */
 510 STATIC int
 511 xfs_iformat_extents(
 512         xfs_inode_t     *ip,
 513         xfs_dinode_t    *dip,
 514         int             whichfork)
 515 {
 516         xfs_bmbt_rec_t  *dp;
 517         xfs_ifork_t     *ifp;
 518         int             nex;
 519         int             size;
 520         int             i;
 521
 522         ifp = XFS_IFORK_PTR(ip, whichfork);
 523         nex = XFS_DFORK_NEXTENTS(dip, whichfork);
 524         size = nex * (uint)sizeof(xfs_bmbt_rec_t);
 525
 526         /*
 527          * If the number of extents is unreasonable, then something
 528          * is wrong and we just bail out rather than crash in
 529          * kmem_alloc() or memcpy() below.
 530          */
 531         if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 532                 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
 533                         (unsigned long long) ip->i_ino, nex);
 534                 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 535                                      ip->i_mount, dip);
 536                 return XFS_ERROR(EFSCORRUPTED);
 537         }
 538
 539         ifp->if_real_bytes = 0;
 540         if (nex == 0)
 541                 ifp->if_u1.if_extents = NULL;
 542         else if (nex <= XFS_INLINE_EXTS)
 543                 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 544         else
 545                 xfs_iext_add(ifp, 0, nex);
 546
 547         ifp->if_bytes = size;
 548         if (size) {
 549                 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
 550                 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
 551                 for (i = 0; i < nex; i++, dp++) {
 552                         xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 553                         ep->l0 = get_unaligned_be64(&dp->l0);
 554                         ep->l1 = get_unaligned_be64(&dp->l1);
 555                 }
 556                 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
 557                 if (whichfork != XFS_DATA_FORK ||
 558                         XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
 559                                 if (unlikely(xfs_check_nostate_extents(
 560                                     ifp, 0, nex))) {
 561                                         XFS_ERROR_REPORT("xfs_iformat_extents(2)",
 562                                                          XFS_ERRLEVEL_LOW,
 563                                                          ip->i_mount);
 564                                         return XFS_ERROR(EFSCORRUPTED);
 565                                 }
 566         }
 567         ifp->if_flags |= XFS_IFEXTENTS;
 568         return 0;
 569 }
 570
 571 /*
 572  * The file has too many extents to fit into
 573  * the inode, so they are in B-tree format.
 574  * Allocate a buffer for the root of the B-tree
 575  * and copy the root into it.  The i_extents
 576  * field will remain NULL until all of the
 577  * extents are read in (when they are needed).
 578  */
 579 STATIC int
 580 xfs_iformat_btree(
 581         xfs_inode_t             *ip,
 582         xfs_dinode_t            *dip,
 583         int                     whichfork)
 584 {
 585         xfs_bmdr_block_t        *dfp;
 586         xfs_ifork_t             *ifp;
 587         /* REFERENCED */
 588         int                     nrecs;
 589         int                     size;
 590
 591         ifp = XFS_IFORK_PTR(ip, whichfork);
 592         dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 593         size = XFS_BMAP_BROOT_SPACE(dfp);
 594         nrecs = be16_to_cpu(dfp->bb_numrecs);
 595
 596         /*
 597          * blow out if -- fork has less extents than can fit in
 598          * fork (fork shouldn't be a btree format), root btree
 599          * block has more records than can fit into the fork,
 600          * or the number of extents is greater than the number of
 601          * blocks.
 602          */
 603         if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
 604                         XFS_IFORK_MAXEXT(ip, whichfork) ||
 605                      XFS_BMDR_SPACE_CALC(nrecs) >
 606                         XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
 607                      XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
 608                 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
 609                         (unsigned long long) ip->i_ino);
 610                 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
 611                                  ip->i_mount, dip);
 612                 return XFS_ERROR(EFSCORRUPTED);
 613         }
 614
 615         ifp->if_broot_bytes = size;
 616         ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
 617         ASSERT(ifp->if_broot != NULL);
 618         /*
 619          * Copy and convert from the on-disk structure
 620          * to the in-memory structure.
 621          */
 622         xfs_bmdr_to_bmbt(ip->i_mount, dfp,
 623                          XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
 624                          ifp->if_broot, size);
 625         ifp->if_flags &= ~XFS_IFEXTENTS;
 626         ifp->if_flags |= XFS_IFBROOT;
 627
 628         return 0;
 629 }
 630
 631 STATIC void
 632 xfs_dinode_from_disk(
 633         xfs_icdinode_t          *to,
 634         xfs_dinode_t            *from)
 635 {
 636         to->di_magic = be16_to_cpu(from->di_magic);
 637         to->di_mode = be16_to_cpu(from->di_mode);
 638         to->di_version = from ->di_version;
 639         to->di_format = from->di_format;
 640         to->di_onlink = be16_to_cpu(from->di_onlink);
 641         to->di_uid = be32_to_cpu(from->di_uid);
 642         to->di_gid = be32_to_cpu(from->di_gid);
 643         to->di_nlink = be32_to_cpu(from->di_nlink);
 644         to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
 645         to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 646         memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 647         to->di_flushiter = be16_to_cpu(from->di_flushiter);
 648         to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
 649         to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
 650         to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
 651         to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
 652         to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
 653         to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
 654         to->di_size = be64_to_cpu(from->di_size);
 655         to->di_nblocks = be64_to_cpu(from->di_nblocks);
 656         to->di_extsize = be32_to_cpu(from->di_extsize);
 657         to->di_nextents = be32_to_cpu(from->di_nextents);
 658         to->di_anextents = be16_to_cpu(from->di_anextents);
 659         to->di_forkoff = from->di_forkoff;
 660         to->di_aformat  = from->di_aformat;
 661         to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
 662         to->di_dmstate  = be16_to_cpu(from->di_dmstate);
 663         to->di_flags    = be16_to_cpu(from->di_flags);
 664         to->di_gen      = be32_to_cpu(from->di_gen);
 665 }
 666
 667 void
 668 xfs_dinode_to_disk(
 669         xfs_dinode_t            *to,
 670         xfs_icdinode_t          *from)
 671 {
 672         to->di_magic = cpu_to_be16(from->di_magic);
 673         to->di_mode = cpu_to_be16(from->di_mode);
 674         to->di_version = from ->di_version;
 675         to->di_format = from->di_format;
 676         to->di_onlink = cpu_to_be16(from->di_onlink);
 677         to->di_uid = cpu_to_be32(from->di_uid);
 678         to->di_gid = cpu_to_be32(from->di_gid);
 679         to->di_nlink = cpu_to_be32(from->di_nlink);
 680         to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 681         to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 682         memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 683         to->di_flushiter = cpu_to_be16(from->di_flushiter);
 684         to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 685         to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 686         to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 687         to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 688         to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 689         to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 690         to->di_size = cpu_to_be64(from->di_size);
 691         to->di_nblocks = cpu_to_be64(from->di_nblocks);
 692         to->di_extsize = cpu_to_be32(from->di_extsize);
 693         to->di_nextents = cpu_to_be32(from->di_nextents);
 694         to->di_anextents = cpu_to_be16(from->di_anextents);
 695         to->di_forkoff = from->di_forkoff;
 696         to->di_aformat = from->di_aformat;
 697         to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 698         to->di_dmstate = cpu_to_be16(from->di_dmstate);
 699         to->di_flags = cpu_to_be16(from->di_flags);
 700         to->di_gen = cpu_to_be32(from->di_gen);
 701 }
 702
 703 STATIC uint
 704 _xfs_dic2xflags(
 705         __uint16_t              di_flags)
 706 {
 707         uint                    flags = 0;
 708
 709         if (di_flags & XFS_DIFLAG_ANY) {
 710                 if (di_flags & XFS_DIFLAG_REALTIME)
 711                         flags |= XFS_XFLAG_REALTIME;
 712                 if (di_flags & XFS_DIFLAG_PREALLOC)
 713                         flags |= XFS_XFLAG_PREALLOC;
 714                 if (di_flags & XFS_DIFLAG_IMMUTABLE)
 715                         flags |= XFS_XFLAG_IMMUTABLE;
 716                 if (di_flags & XFS_DIFLAG_APPEND)
 717                         flags |= XFS_XFLAG_APPEND;
 718                 if (di_flags & XFS_DIFLAG_SYNC)
 719                         flags |= XFS_XFLAG_SYNC;
 720                 if (di_flags & XFS_DIFLAG_NOATIME)
 721                         flags |= XFS_XFLAG_NOATIME;
 722                 if (di_flags & XFS_DIFLAG_NODUMP)
 723                         flags |= XFS_XFLAG_NODUMP;
 724                 if (di_flags & XFS_DIFLAG_RTINHERIT)
 725                         flags |= XFS_XFLAG_RTINHERIT;
 726                 if (di_flags & XFS_DIFLAG_PROJINHERIT)
 727                         flags |= XFS_XFLAG_PROJINHERIT;
 728                 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
 729                         flags |= XFS_XFLAG_NOSYMLINKS;
 730                 if (di_flags & XFS_DIFLAG_EXTSIZE)
 731                         flags |= XFS_XFLAG_EXTSIZE;
 732                 if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 733                         flags |= XFS_XFLAG_EXTSZINHERIT;
 734                 if (di_flags & XFS_DIFLAG_NODEFRAG)
 735                         flags |= XFS_XFLAG_NODEFRAG;
 736                 if (di_flags & XFS_DIFLAG_FILESTREAM)
 737                         flags |= XFS_XFLAG_FILESTREAM;
 738         }
 739
 740         return flags;
 741 }
 742
 743 uint
 744 xfs_ip2xflags(
 745         xfs_inode_t             *ip)
 746 {
 747         xfs_icdinode_t          *dic = &ip->i_d;
 748
 749         return _xfs_dic2xflags(dic->di_flags) |
 750                                 (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
 751 }
 752
 753 uint
 754 xfs_dic2xflags(
 755         xfs_dinode_t            *dip)
 756 {
 757         return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
 758                                 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 759 }
 760
 761 /*
 762  * Read the disk inode attributes into the in-core inode structure.
 763  */
 764 int
 765 xfs_iread(
 766         xfs_mount_t     *mp,
 767         xfs_trans_t     *tp,
 768         xfs_inode_t     *ip,
 769         uint            iget_flags)
 770 {
 771         xfs_buf_t       *bp;
 772         xfs_dinode_t    *dip;
 773         int             error;
 774
 775         /*
 776          * Fill in the location information in the in-core inode.
 777          */
 778         error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 779         if (error)
 780                 return error;
 781
 782         /*
 783          * Get pointers to the on-disk inode and the buffer containing it.
 784          */
 785         error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags);
 786         if (error)
 787                 return error;
 788         dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 789
 790         /*
 791          * If we got something that isn't an inode it means someone
 792          * (nfs or dmi) has a stale handle.
 793          */
 794         if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
 795 #ifdef DEBUG
 796                 xfs_alert(mp,
 797                         "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
 798                         __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
 799 #endif /* DEBUG */
 800                 error = XFS_ERROR(EINVAL);
 801                 goto out_brelse;
 802         }
 803
 804         /*
 805          * If the on-disk inode is already linked to a directory
 806          * entry, copy all of the inode into the in-core inode.
 807          * xfs_iformat() handles copying in the inode format
 808          * specific information.
 809          * Otherwise, just get the truly permanent information.
 810          */
 811         if (dip->di_mode) {
 812                 xfs_dinode_from_disk(&ip->i_d, dip);
 813                 error = xfs_iformat(ip, dip);
 814                 if (error)  {
 815 #ifdef DEBUG
 816                         xfs_alert(mp, "%s: xfs_iformat() returned error %d",
 817                                 __func__, error);
 818 #endif /* DEBUG */
 819                         goto out_brelse;
 820                 }
 821         } else {
 822                 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
 823                 ip->i_d.di_version = dip->di_version;
 824                 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
 825                 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
 826                 /*
 827                  * Make sure to pull in the mode here as well in
 828                  * case the inode is released without being used.
 829                  * This ensures that xfs_inactive() will see that
 830                  * the inode is already free and not try to mess
 831                  * with the uninitialized part of it.
 832                  */
 833                 ip->i_d.di_mode = 0;
 834         }
 835
 836         /*
 837          * The inode format changed when we moved the link count and
 838          * made it 32 bits long.  If this is an old format inode,
 839          * convert it in memory to look like a new one.  If it gets
 840          * flushed to disk we will convert back before flushing or
 841          * logging it.  We zero out the new projid field and the old link
 842          * count field.  We'll handle clearing the pad field (the remains
 843          * of the old uuid field) when we actually convert the inode to
 844          * the new format. We don't change the version number so that we
 845          * can distinguish this from a real new format inode.
 846          */
 847         if (ip->i_d.di_version == 1) {
 848                 ip->i_d.di_nlink = ip->i_d.di_onlink;
 849                 ip->i_d.di_onlink = 0;
 850                 xfs_set_projid(ip, 0);
 851         }
 852
 853         ip->i_delayed_blks = 0;
 854
 855         /*
 856          * Mark the buffer containing the inode as something to keep
 857          * around for a while.  This helps to keep recently accessed
 858          * meta-data in-core longer.
 859          */
 860         xfs_buf_set_ref(bp, XFS_INO_REF);
 861
 862         /*
 863          * Use xfs_trans_brelse() to release the buffer containing the
 864          * on-disk inode, because it was acquired with xfs_trans_read_buf()
 865          * in xfs_itobp() above.  If tp is NULL, this is just a normal
 866          * brelse().  If we're within a transaction, then xfs_trans_brelse()
 867          * will only release the buffer if it is not dirty within the
 868          * transaction.  It will be OK to release the buffer in this case,
 869          * because inodes on disk are never destroyed and we will be
 870          * locking the new in-core inode before putting it in the hash
 871          * table where other processes can find it.  Thus we don't have
 872          * to worry about the inode being changed just because we released
 873          * the buffer.
 874          */
 875  out_brelse:
 876         xfs_trans_brelse(tp, bp);
 877         return error;
 878 }
 879
 880 /*
 881  * Read in extents from a btree-format inode.
 882  * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 883  */
 884 int
 885 xfs_iread_extents(
 886         xfs_trans_t     *tp,
 887         xfs_inode_t     *ip,
 888         int             whichfork)
 889 {
 890         int             error;
 891         xfs_ifork_t     *ifp;
 892         xfs_extnum_t    nextents;
 893
 894         if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 895                 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
 896                                  ip->i_mount);
 897                 return XFS_ERROR(EFSCORRUPTED);
 898         }
 899         nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
 900         ifp = XFS_IFORK_PTR(ip, whichfork);
 901
 902         /*
 903          * We know that the size is valid (it's checked in iformat_btree)
 904          */
 905         ifp->if_bytes = ifp->if_real_bytes = 0;
 906         ifp->if_flags |= XFS_IFEXTENTS;
 907         xfs_iext_add(ifp, 0, nextents);
 908         error = xfs_bmap_read_extents(tp, ip, whichfork);
 909         if (error) {
 910                 xfs_iext_destroy(ifp);
 911                 ifp->if_flags &= ~XFS_IFEXTENTS;
 912                 return error;
 913         }
 914         xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
 915         return 0;
 916 }
 917
 918 /*
 919  * Allocate an inode on disk and return a copy of its in-core version.
 920  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 921  * appropriately within the inode.  The uid and gid for the inode are
 922  * set according to the contents of the given cred structure.
 923  *
 924  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 925  * has a free inode available, call xfs_iget()
 926  * to obtain the in-core version of the allocated inode.  Finally,
 927  * fill in the inode and log its initial contents.  In this case,
 928  * ialloc_context would be set to NULL and call_again set to false.
 929  *
 930  * If xfs_dialloc() does not have an available inode,
 931  * it will replenish its supply by doing an allocation. Since we can
 932  * only do one allocation within a transaction without deadlocks, we
 933  * must commit the current transaction before returning the inode itself.
 934  * In this case, therefore, we will set call_again to true and return.
 935  * The caller should then commit the current transaction, start a new
 936  * transaction, and call xfs_ialloc() again to actually get the inode.
 937  *
 938  * To ensure that some other process does not grab the inode that
 939  * was allocated during the first call to xfs_ialloc(), this routine
 940  * also returns the [locked] bp pointing to the head of the freelist
 941  * as ialloc_context.  The caller should hold this buffer across
 942  * the commit and pass it back into this routine on the second call.
 943  *
 944  * If we are allocating quota inodes, we do not have a parent inode
 945  * to attach to or associate with (i.e. pip == NULL) because they
 946  * are not linked into the directory structure - they are attached
 947  * directly to the superblock - and so have no parent.
 948  */
 949 int
 950 xfs_ialloc(
 951         xfs_trans_t     *tp,
 952         xfs_inode_t     *pip,
 953         umode_t         mode,
 954         xfs_nlink_t     nlink,
 955         xfs_dev_t       rdev,
 956         prid_t          prid,
 957         int             okalloc,
 958         xfs_buf_t       **ialloc_context,
 959         boolean_t       *call_again,
 960         xfs_inode_t     **ipp)
 961 {
 962         xfs_ino_t       ino;
 963         xfs_inode_t     *ip;
 964         uint            flags;
 965         int             error;
 966         timespec_t      tv;
 967         int             filestreams = 0;
 968
 969         /*
 970          * Call the space management code to pick
 971          * the on-disk inode to be allocated.
 972          */
 973         error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
 974                             ialloc_context, call_again, &ino);
 975         if (error)
 976                 return error;
 977         if (*call_again || ino == NULLFSINO) {
 978                 *ipp = NULL;
 979                 return 0;
 980         }
 981         ASSERT(*ialloc_context == NULL);
 982
 983         /*
 984          * Get the in-core inode with the lock held exclusively.
 985          * This is because we're setting fields here we need
 986          * to prevent others from looking at until we're done.
 987          */
 988         error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
 989                          XFS_ILOCK_EXCL, &ip);
 990         if (error)
 991                 return error;
 992         ASSERT(ip != NULL);
 993
 994         ip->i_d.di_mode = mode;
 995         ip->i_d.di_onlink = 0;
 996         ip->i_d.di_nlink = nlink;
 997         ASSERT(ip->i_d.di_nlink == nlink);
 998         ip->i_d.di_uid = current_fsuid();
 999         ip->i_d.di_gid = current_fsgid();
1000         xfs_set_projid(ip, prid);
1001         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1002
1003         /*
1004          * If the superblock version is up to where we support new format
1005          * inodes and this is currently an old format inode, then change
1006          * the inode version number now.  This way we only do the conversion
1007          * here rather than here and in the flush/logging code.
1008          */
1009         if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1010             ip->i_d.di_version == 1) {
1011                 ip->i_d.di_version = 2;
1012                 /*
1013                  * We've already zeroed the old link count, the projid field,
1014                  * and the pad field.
1015                  */
1016         }
1017
1018         /*
1019          * Project ids won't be stored on disk if we are using a version 1 inode.
1020          */
1021         if ((prid != 0) && (ip->i_d.di_version == 1))
1022                 xfs_bump_ino_vers2(tp, ip);
1023
1024         if (pip && XFS_INHERIT_GID(pip)) {
1025                 ip->i_d.di_gid = pip->i_d.di_gid;
1026                 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
1027                         ip->i_d.di_mode |= S_ISGID;
1028                 }
1029         }
1030
1031         /*
1032          * If the group ID of the new file does not match the effective group
1033          * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
1034          * (and only if the irix_sgid_inherit compatibility variable is set).
1035          */
1036         if ((irix_sgid_inherit) &&
1037             (ip->i_d.di_mode & S_ISGID) &&
1038             (!in_group_p((gid_t)ip->i_d.di_gid))) {
1039                 ip->i_d.di_mode &= ~S_ISGID;
1040         }
1041
1042         ip->i_d.di_size = 0;
1043         ip->i_d.di_nextents = 0;
1044         ASSERT(ip->i_d.di_nblocks == 0);
1045
1046         nanotime(&tv);
1047         ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
1048         ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
1049         ip->i_d.di_atime = ip->i_d.di_mtime;
1050         ip->i_d.di_ctime = ip->i_d.di_mtime;
1051
1052         /*
1053          * di_gen will have been taken care of in xfs_iread.
1054          */
1055         ip->i_d.di_extsize = 0;
1056         ip->i_d.di_dmevmask = 0;
1057         ip->i_d.di_dmstate = 0;
1058         ip->i_d.di_flags = 0;
1059         flags = XFS_ILOG_CORE;
1060         switch (mode & S_IFMT) {
1061         case S_IFIFO:
1062         case S_IFCHR:
1063         case S_IFBLK:
1064         case S_IFSOCK:
1065                 ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1066                 ip->i_df.if_u2.if_rdev = rdev;
1067                 ip->i_df.if_flags = 0;
1068                 flags |= XFS_ILOG_DEV;
1069                 break;
1070         case S_IFREG:
1071                 /*
1072                  * we can't set up filestreams until after the VFS inode
1073                  * is set up properly.
1074                  */
1075                 if (pip && xfs_inode_is_filestream(pip))
1076                         filestreams = 1;
1077                 /* fall through */
1078         case S_IFDIR:
1079                 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1080                         uint    di_flags = 0;
1081
1082                         if (S_ISDIR(mode)) {
1083                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1084                                         di_flags |= XFS_DIFLAG_RTINHERIT;
1085                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1086                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
1087                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
1088                                 }
1089                         } else if (S_ISREG(mode)) {
1090                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1091                                         di_flags |= XFS_DIFLAG_REALTIME;
1092                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1093                                         di_flags |= XFS_DIFLAG_EXTSIZE;
1094                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
1095                                 }
1096                         }
1097                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1098                             xfs_inherit_noatime)
1099                                 di_flags |= XFS_DIFLAG_NOATIME;
1100                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1101                             xfs_inherit_nodump)
1102                                 di_flags |= XFS_DIFLAG_NODUMP;
1103                         if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1104                             xfs_inherit_sync)
1105                                 di_flags |= XFS_DIFLAG_SYNC;
1106                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
1107                             xfs_inherit_nosymlinks)
1108                                 di_flags |= XFS_DIFLAG_NOSYMLINKS;
1109                         if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1110                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
1111                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
1112                             xfs_inherit_nodefrag)
1113                                 di_flags |= XFS_DIFLAG_NODEFRAG;
1114                         if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
1115                                 di_flags |= XFS_DIFLAG_FILESTREAM;
1116                         ip->i_d.di_flags |= di_flags;
1117                 }
1118                 /* FALLTHROUGH */
1119         case S_IFLNK:
1120                 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1121                 ip->i_df.if_flags = XFS_IFEXTENTS;
1122                 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1123                 ip->i_df.if_u1.if_extents = NULL;
1124                 break;
1125         default:
1126                 ASSERT(0);
1127         }
1128         /*
1129          * Attribute fork settings for new inode.
1130          */
1131         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1132         ip->i_d.di_anextents = 0;
1133
1134         /*
1135          * Log the new values stuffed into the inode.
1136          */
1137         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1138         xfs_trans_log_inode(tp, ip, flags);
1139
1140         /* now that we have an i_mode we can setup inode ops and unlock */
1141         xfs_setup_inode(ip);
1142
1143         /* now we have set up the vfs inode we can associate the filestream */
1144         if (filestreams) {
1145                 error = xfs_filestream_associate(pip, ip);
1146                 if (error < 0)
1147                         return -error;
1148                 if (!error)
1149                         xfs_iflags_set(ip, XFS_IFILESTREAM);
1150         }
1151
1152         *ipp = ip;
1153         return 0;
1154 }
1155
1156 /*
1157  * Free up the underlying blocks past new_size.  The new size must be smaller
1158  * than the current size.  This routine can be used both for the attribute and
1159  * data fork, and does not modify the inode size, which is left to the caller.
1160  *
1161  * The transaction passed to this routine must have made a permanent log
1162  * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1163  * given transaction and start new ones, so make sure everything involved in
1164  * the transaction is tidy before calling here.  Some transaction will be
1165  * returned to the caller to be committed.  The incoming transaction must
1166  * already include the inode, and both inode locks must be held exclusively.
1167  * The inode must also be "held" within the transaction.  On return the inode
1168  * will be "held" within the returned transaction.  This routine does NOT
1169  * require any disk space to be reserved for it within the transaction.
1170  *
1171  * If we get an error, we must return with the inode locked and linked into the
1172  * current transaction. This keeps things simple for the higher level code,
1173  * because it always knows that the inode is locked and held in the transaction
1174  * that returns to it whether errors occur or not.  We don't mark the inode
1175  * dirty on error so that transactions can be easily aborted if possible.
1176  */
1177 int
1178 xfs_itruncate_extents(
1179         struct xfs_trans        **tpp,
1180         struct xfs_inode        *ip,
1181         int                     whichfork,
1182         xfs_fsize_t             new_size)
1183 {
1184         struct xfs_mount        *mp = ip->i_mount;
1185         struct xfs_trans        *tp = *tpp;
1186         struct xfs_trans        *ntp;
1187         xfs_bmap_free_t         free_list;
1188         xfs_fsblock_t           first_block;
1189         xfs_fileoff_t           first_unmap_block;
1190         xfs_fileoff_t           last_block;
1191         xfs_filblks_t           unmap_len;
1192         int                     committed;
1193         int                     error = 0;
1194         int                     done = 0;
1195
1196         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1197         ASSERT(new_size <= XFS_ISIZE(ip));
1198         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1199         ASSERT(ip->i_itemp != NULL);
1200         ASSERT(ip->i_itemp->ili_lock_flags == 0);
1201         ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1202
1203         trace_xfs_itruncate_extents_start(ip, new_size);
1204
1205         /*
1206          * Since it is possible for space to become allocated beyond
1207          * the end of the file (in a crash where the space is allocated
1208          * but the inode size is not yet updated), simply remove any
1209          * blocks which show up between the new EOF and the maximum
1210          * possible file size.  If the first block to be removed is
1211          * beyond the maximum file size (ie it is the same as last_block),
1212          * then there is nothing to do.
1213          */
1214         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1215         last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1216         if (first_unmap_block == last_block)
1217                 return 0;
1218
1219         ASSERT(first_unmap_block < last_block);
1220         unmap_len = last_block - first_unmap_block + 1;
1221         while (!done) {
1222                 xfs_bmap_init(&free_list, &first_block);
1223                 error = xfs_bunmapi(tp, ip,
1224                                     first_unmap_block, unmap_len,
1225                                     xfs_bmapi_aflag(whichfork),
1226                                     XFS_ITRUNC_MAX_EXTENTS,
1227                                     &first_block, &free_list,
1228                                     &done);
1229                 if (error)
1230                         goto out_bmap_cancel;
1231
1232                 /*
1233                  * Duplicate the transaction that has the permanent
1234                  * reservation and commit the old transaction.
1235                  */
1236                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1237                 if (committed)
1238                         xfs_trans_ijoin(tp, ip, 0);
1239                 if (error)
1240                         goto out_bmap_cancel;
1241
1242                 if (committed) {
1243                         /*
1244                          * Mark the inode dirty so it will be logged and
1245                          * moved forward in the log as part of every commit.
1246                          */
1247                         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1248                 }
1249
1250                 ntp = xfs_trans_dup(tp);
1251                 error = xfs_trans_commit(tp, 0);
1252                 tp = ntp;
1253
1254                 xfs_trans_ijoin(tp, ip, 0);
1255
1256                 if (error)
1257                         goto out;
1258
1259                 /*
1260                  * Transaction commit worked ok so we can drop the extra ticket
1261                  * reference that we gained in xfs_trans_dup()
1262                  */
1263                 xfs_log_ticket_put(tp->t_ticket);
1264                 error = xfs_trans_reserve(tp, 0,
1265                                         XFS_ITRUNCATE_LOG_RES(mp), 0,
1266                                         XFS_TRANS_PERM_LOG_RES,
1267                                         XFS_ITRUNCATE_LOG_COUNT);
1268                 if (error)
1269                         goto out;
1270         }
1271
1272         /*
1273          * Always re-log the inode so that our permanent transaction can keep
1274          * on rolling it forward in the log.
1275          */
1276         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1277
1278         trace_xfs_itruncate_extents_end(ip, new_size);
1279
1280 out:
1281         *tpp = tp;
1282         return error;
1283 out_bmap_cancel:
1284         /*
1285          * If the bunmapi call encounters an error, return to the caller where
1286          * the transaction can be properly aborted.  We just need to make sure
1287          * we're not holding any resources that we were not when we came in.
1288          */
1289         xfs_bmap_cancel(&free_list);
1290         goto out;
1291 }
1292
1293 /*
1294  * This is called when the inode's link count goes to 0.
1295  * We place the on-disk inode on a list in the AGI.  It
1296  * will be pulled from this list when the inode is freed.
1297  */
1298 int
1299 xfs_iunlink(
1300         xfs_trans_t     *tp,
1301         xfs_inode_t     *ip)
1302 {
1303         xfs_mount_t     *mp;
1304         xfs_agi_t       *agi;
1305         xfs_dinode_t    *dip;
1306         xfs_buf_t       *agibp;
1307         xfs_buf_t       *ibp;
1308         xfs_agino_t     agino;
1309         short           bucket_index;
1310         int             offset;
1311         int             error;
1312
1313         ASSERT(ip->i_d.di_nlink == 0);
1314         ASSERT(ip->i_d.di_mode != 0);
1315
1316         mp = tp->t_mountp;
1317
1318         /*
1319          * Get the agi buffer first.  It ensures lock ordering
1320          * on the list.
1321          */
1322         error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1323         if (error)
1324                 return error;
1325         agi = XFS_BUF_TO_AGI(agibp);
1326
1327         /*
1328          * Get the index into the agi hash table for the
1329          * list this inode will go on.
1330          */
1331         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1332         ASSERT(agino != 0);
1333         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1334         ASSERT(agi->agi_unlinked[bucket_index]);
1335         ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1336
1337         if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1338                 /*
1339                  * There is already another inode in the bucket we need
1340                  * to add ourselves to.  Add us at the front of the list.
1341                  * Here we put the head pointer into our next pointer,
1342                  * and then we fall through to point the head at us.
1343                  */
1344                 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1345                 if (error)
1346                         return error;
1347
1348                 ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1349                 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1350                 offset = ip->i_imap.im_boffset +
1351                         offsetof(xfs_dinode_t, di_next_unlinked);
1352                 xfs_trans_inode_buf(tp, ibp);
1353                 xfs_trans_log_buf(tp, ibp, offset,
1354                                   (offset + sizeof(xfs_agino_t) - 1));
1355                 xfs_inobp_check(mp, ibp);
1356         }
1357
1358         /*
1359          * Point the bucket head pointer at the inode being inserted.
1360          */
1361         ASSERT(agino != 0);
1362         agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1363         offset = offsetof(xfs_agi_t, agi_unlinked) +
1364                 (sizeof(xfs_agino_t) * bucket_index);
1365         xfs_trans_log_buf(tp, agibp, offset,
1366                           (offset + sizeof(xfs_agino_t) - 1));
1367         return 0;
1368 }
1369
1370 /*
1371  * Pull the on-disk inode from the AGI unlinked list.
1372  */
1373 STATIC int
1374 xfs_iunlink_remove(
1375         xfs_trans_t     *tp,
1376         xfs_inode_t     *ip)
1377 {
1378         xfs_ino_t       next_ino;
1379         xfs_mount_t     *mp;
1380         xfs_agi_t       *agi;
1381         xfs_dinode_t    *dip;
1382         xfs_buf_t       *agibp;
1383         xfs_buf_t       *ibp;
1384         xfs_agnumber_t  agno;
1385         xfs_agino_t     agino;
1386         xfs_agino_t     next_agino;
1387         xfs_buf_t       *last_ibp;
1388         xfs_dinode_t    *last_dip = NULL;
1389         short           bucket_index;
1390         int             offset, last_offset = 0;
1391         int             error;
1392
1393         mp = tp->t_mountp;
1394         agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1395
1396         /*
1397          * Get the agi buffer first.  It ensures lock ordering
1398          * on the list.
1399          */
1400         error = xfs_read_agi(mp, tp, agno, &agibp);
1401         if (error)
1402                 return error;
1403
1404         agi = XFS_BUF_TO_AGI(agibp);
1405
1406         /*
1407          * Get the index into the agi hash table for the
1408          * list this inode will go on.
1409          */
1410         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1411         ASSERT(agino != 0);
1412         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1413         ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
1414         ASSERT(agi->agi_unlinked[bucket_index]);
1415
1416         if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
1417                 /*
1418                  * We're at the head of the list.  Get the inode's
1419                  * on-disk buffer to see if there is anyone after us
1420                  * on the list.  Only modify our next pointer if it
1421                  * is not already NULLAGINO.  This saves us the overhead
1422                  * of dealing with the buffer when there is no need to
1423                  * change it.
1424                  */
1425                 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1426                 if (error) {
1427                         xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1428                                 __func__, error);
1429                         return error;
1430                 }
1431                 next_agino = be32_to_cpu(dip->di_next_unlinked);
1432                 ASSERT(next_agino != 0);
1433                 if (next_agino != NULLAGINO) {
1434                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1435                         offset = ip->i_imap.im_boffset +
1436                                 offsetof(xfs_dinode_t, di_next_unlinked);
1437                         xfs_trans_inode_buf(tp, ibp);
1438                         xfs_trans_log_buf(tp, ibp, offset,
1439                                           (offset + sizeof(xfs_agino_t) - 1));
1440                         xfs_inobp_check(mp, ibp);
1441                 } else {
1442                         xfs_trans_brelse(tp, ibp);
1443                 }
1444                 /*
1445                  * Point the bucket head pointer at the next inode.
1446                  */
1447                 ASSERT(next_agino != 0);
1448                 ASSERT(next_agino != agino);
1449                 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
1450                 offset = offsetof(xfs_agi_t, agi_unlinked) +
1451                         (sizeof(xfs_agino_t) * bucket_index);
1452                 xfs_trans_log_buf(tp, agibp, offset,
1453                                   (offset + sizeof(xfs_agino_t) - 1));
1454         } else {
1455                 /*
1456                  * We need to search the list for the inode being freed.
1457                  */
1458                 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1459                 last_ibp = NULL;
1460                 while (next_agino != agino) {
1461                         /*
1462                          * If the last inode wasn't the one pointing to
1463                          * us, then release its buffer since we're not
1464                          * going to do anything with it.
1465                          */
1466                         if (last_ibp != NULL) {
1467                                 xfs_trans_brelse(tp, last_ibp);
1468                         }
1469                         next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1470                         error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1471                                             &last_ibp, &last_offset, 0);
1472                         if (error) {
1473                                 xfs_warn(mp,
1474                                         "%s: xfs_inotobp() returned error %d.",
1475                                         __func__, error);
1476                                 return error;
1477                         }
1478                         next_agino = be32_to_cpu(last_dip->di_next_unlinked);
1479                         ASSERT(next_agino != NULLAGINO);
1480                         ASSERT(next_agino != 0);
1481                 }
1482                 /*
1483                  * Now last_ibp points to the buffer previous to us on
1484                  * the unlinked list.  Pull us from the list.
1485                  */
1486                 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1487                 if (error) {
1488                         xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1489                                 __func__, error);
1490                         return error;
1491                 }
1492                 next_agino = be32_to_cpu(dip->di_next_unlinked);
1493                 ASSERT(next_agino != 0);
1494                 ASSERT(next_agino != agino);
1495                 if (next_agino != NULLAGINO) {
1496                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1497                         offset = ip->i_imap.im_boffset +
1498                                 offsetof(xfs_dinode_t, di_next_unlinked);
1499                         xfs_trans_inode_buf(tp, ibp);
1500                         xfs_trans_log_buf(tp, ibp, offset,
1501                                           (offset + sizeof(xfs_agino_t) - 1));
1502                         xfs_inobp_check(mp, ibp);
1503                 } else {
1504                         xfs_trans_brelse(tp, ibp);
1505                 }
1506                 /*
1507                  * Point the previous inode on the list to the next inode.
1508                  */
1509                 last_dip->di_next_unlinked = cpu_to_be32(next_agino);
1510                 ASSERT(next_agino != 0);
1511                 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
1512                 xfs_trans_inode_buf(tp, last_ibp);
1513                 xfs_trans_log_buf(tp, last_ibp, offset,
1514                                   (offset + sizeof(xfs_agino_t) - 1));
1515                 xfs_inobp_check(mp, last_ibp);
1516         }
1517         return 0;
1518 }
1519
1520 /*
1521  * A big issue when freeing the inode cluster is is that we _cannot_ skip any
1522  * inodes that are in memory - they all must be marked stale and attached to
1523  * the cluster buffer.
1524  */
1525 STATIC int
1526 xfs_ifree_cluster(
1527         xfs_inode_t     *free_ip,
1528         xfs_trans_t     *tp,
1529         xfs_ino_t       inum)
1530 {
1531         xfs_mount_t             *mp = free_ip->i_mount;
1532         int                     blks_per_cluster;
1533         int                     nbufs;
1534         int                     ninodes;
1535         int                     i, j;
1536         xfs_daddr_t             blkno;
1537         xfs_buf_t               *bp;
1538         xfs_inode_t             *ip;
1539         xfs_inode_log_item_t    *iip;
1540         xfs_log_item_t          *lip;
1541         struct xfs_perag        *pag;
1542
1543         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1544         if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1545                 blks_per_cluster = 1;
1546                 ninodes = mp->m_sb.sb_inopblock;
1547                 nbufs = XFS_IALLOC_BLOCKS(mp);
1548         } else {
1549                 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
1550                                         mp->m_sb.sb_blocksize;
1551                 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
1552                 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1553         }
1554
1555         for (j = 0; j < nbufs; j++, inum += ninodes) {
1556                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1557                                          XFS_INO_TO_AGBNO(mp, inum));
1558
1559                 /*
1560                  * We obtain and lock the backing buffer first in the process
1561                  * here, as we have to ensure that any dirty inode that we
1562                  * can't get the flush lock on is attached to the buffer.
1563                  * If we scan the in-memory inodes first, then buffer IO can
1564                  * complete before we get a lock on it, and hence we may fail
1565                  * to mark all the active inodes on the buffer stale.
1566                  */
1567                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1568                                         mp->m_bsize * blks_per_cluster, 0);
1569
1570                 if (!bp)
1571                         return ENOMEM;
1572                 /*
1573                  * Walk the inodes already attached to the buffer and mark them
1574                  * stale. These will all have the flush locks held, so an
1575                  * in-memory inode walk can't lock them. By marking them all
1576                  * stale first, we will not attempt to lock them in the loop
1577                  * below as the XFS_ISTALE flag will be set.
1578                  */
1579                 lip = bp->b_fspriv;
1580                 while (lip) {
1581                         if (lip->li_type == XFS_LI_INODE) {
1582                                 iip = (xfs_inode_log_item_t *)lip;
1583                                 ASSERT(iip->ili_logged == 1);
1584                                 lip->li_cb = xfs_istale_done;
1585                                 xfs_trans_ail_copy_lsn(mp->m_ail,
1586                                                         &iip->ili_flush_lsn,
1587                                                         &iip->ili_item.li_lsn);
1588                                 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1589                         }
1590                         lip = lip->li_bio_list;
1591                 }
1592
1593
1594                 /*
1595                  * For each inode in memory attempt to add it to the inode
1596                  * buffer and set it up for being staled on buffer IO
1597                  * completion.  This is safe as we've locked out tail pushing
1598                  * and flushing by locking the buffer.
1599                  *
1600                  * We have already marked every inode that was part of a
1601                  * transaction stale above, which means there is no point in
1602                  * even trying to lock them.
1603                  */
1604                 for (i = 0; i < ninodes; i++) {
1605 retry:
1606                         rcu_read_lock();
1607                         ip = radix_tree_lookup(&pag->pag_ici_root,
1608                                         XFS_INO_TO_AGINO(mp, (inum + i)));
1609
1610                         /* Inode not in memory, nothing to do */
1611                         if (!ip) {
1612                                 rcu_read_unlock();
1613                                 continue;
1614                         }
1615
1616                         /*
1617                          * because this is an RCU protected lookup, we could
1618                          * find a recently freed or even reallocated inode
1619                          * during the lookup. We need to check under the
1620                          * i_flags_lock for a valid inode here. Skip it if it
1621                          * is not valid, the wrong inode or stale.
1622                          */
1623                         spin_lock(&ip->i_flags_lock);
1624                         if (ip->i_ino != inum + i ||
1625                             __xfs_iflags_test(ip, XFS_ISTALE)) {
1626                                 spin_unlock(&ip->i_flags_lock);
1627                                 rcu_read_unlock();
1628                                 continue;
1629                         }
1630                         spin_unlock(&ip->i_flags_lock);
1631
1632                         /*
1633                          * Don't try to lock/unlock the current inode, but we
1634                          * _cannot_ skip the other inodes that we did not find
1635                          * in the list attached to the buffer and are not
1636                          * already marked stale. If we can't lock it, back off
1637                          * and retry.
1638                          */
1639                         if (ip != free_ip &&
1640                             !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1641                                 rcu_read_unlock();
1642                                 delay(1);
1643                                 goto retry;
1644                         }
1645                         rcu_read_unlock();
1646
1647                         xfs_iflock(ip);
1648                         xfs_iflags_set(ip, XFS_ISTALE);
1649
1650                         /*
1651                          * we don't need to attach clean inodes or those only
1652                          * with unlogged changes (which we throw away, anyway).
1653                          */
1654                         iip = ip->i_itemp;
1655                         if (!iip || xfs_inode_clean(ip)) {
1656                                 ASSERT(ip != free_ip);
1657                                 xfs_ifunlock(ip);
1658                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1659                                 continue;
1660                         }
1661
1662                         iip->ili_last_fields = iip->ili_fields;
1663                         iip->ili_fields = 0;
1664                         iip->ili_logged = 1;
1665                         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
1666                                                 &iip->ili_item.li_lsn);
1667
1668                         xfs_buf_attach_iodone(bp, xfs_istale_done,
1669                                                   &iip->ili_item);
1670
1671                         if (ip != free_ip)
1672                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1673                 }
1674
1675                 xfs_trans_stale_inode_buf(tp, bp);
1676                 xfs_trans_binval(tp, bp);
1677         }
1678
1679         xfs_perag_put(pag);
1680         return 0;
1681 }
1682
1683 /*
1684  * This is called to return an inode to the inode free list.
1685  * The inode should already be truncated to 0 length and have
1686  * no pages associated with it.  This routine also assumes that
1687  * the inode is already a part of the transaction.
1688  *
1689  * The on-disk copy of the inode will have been added to the list
1690  * of unlinked inodes in the AGI. We need to remove the inode from
1691  * that list atomically with respect to freeing it here.
1692  */
1693 int
1694 xfs_ifree(
1695         xfs_trans_t     *tp,
1696         xfs_inode_t     *ip,
1697         xfs_bmap_free_t *flist)
1698 {
1699         int                     error;
1700         int                     delete;
1701         xfs_ino_t               first_ino;
1702         xfs_dinode_t            *dip;
1703         xfs_buf_t               *ibp;
1704
1705         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1706         ASSERT(ip->i_d.di_nlink == 0);
1707         ASSERT(ip->i_d.di_nextents == 0);
1708         ASSERT(ip->i_d.di_anextents == 0);
1709         ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1710         ASSERT(ip->i_d.di_nblocks == 0);
1711
1712         /*
1713          * Pull the on-disk inode from the AGI unlinked list.
1714          */
1715         error = xfs_iunlink_remove(tp, ip);
1716         if (error != 0) {
1717                 return error;
1718         }
1719
1720         error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
1721         if (error != 0) {
1722                 return error;
1723         }
1724         ip->i_d.di_mode = 0;            /* mark incore inode as free */
1725         ip->i_d.di_flags = 0;
1726         ip->i_d.di_dmevmask = 0;
1727         ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
1728         ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1729         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1730         /*
1731          * Bump the generation count so no one will be confused
1732          * by reincarnations of this inode.
1733          */
1734         ip->i_d.di_gen++;
1735
1736         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1737
1738         error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0);
1739         if (error)
1740                 return error;
1741
1742         /*
1743         * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
1744         * from picking up this inode when it is reclaimed (its incore state
1745         * initialzed but not flushed to disk yet). The in-core di_mode is
1746         * already cleared  and a corresponding transaction logged.
1747         * The hack here just synchronizes the in-core to on-disk
1748         * di_mode value in advance before the actual inode sync to disk.
1749         * This is OK because the inode is already unlinked and would never
1750         * change its di_mode again for this inode generation.
1751         * This is a temporary hack that would require a proper fix
1752         * in the future.
1753         */
1754         dip->di_mode = 0;
1755
1756         if (delete) {
1757                 error = xfs_ifree_cluster(ip, tp, first_ino);
1758         }
1759
1760         return error;
1761 }
1762
1763 /*
1764  * Reallocate the space for if_broot based on the number of records
1765  * being added or deleted as indicated in rec_diff.  Move the records
1766  * and pointers in if_broot to fit the new size.  When shrinking this
1767  * will eliminate holes between the records and pointers created by
1768  * the caller.  When growing this will create holes to be filled in
1769  * by the caller.
1770  *
1771  * The caller must not request to add more records than would fit in
1772  * the on-disk inode root.  If the if_broot is currently NULL, then
1773  * if we adding records one will be allocated.  The caller must also
1774  * not request that the number of records go below zero, although
1775  * it can go to zero.
1776  *
1777  * ip -- the inode whose if_broot area is changing
1778  * ext_diff -- the change in the number of records, positive or negative,
1779  *       requested for the if_broot array.
1780  */
1781 void
1782 xfs_iroot_realloc(
1783         xfs_inode_t             *ip,
1784         int                     rec_diff,
1785         int                     whichfork)
1786 {
1787         struct xfs_mount        *mp = ip->i_mount;
1788         int                     cur_max;
1789         xfs_ifork_t             *ifp;
1790         struct xfs_btree_block  *new_broot;
1791         int                     new_max;
1792         size_t                  new_size;
1793         char                    *np;
1794         char                    *op;
1795
1796         /*
1797          * Handle the degenerate case quietly.
1798          */
1799         if (rec_diff == 0) {
1800                 return;
1801         }
1802
1803         ifp = XFS_IFORK_PTR(ip, whichfork);
1804         if (rec_diff > 0) {
1805                 /*
1806                  * If there wasn't any memory allocated before, just
1807                  * allocate it now and get out.
1808                  */
1809                 if (ifp->if_broot_bytes == 0) {
1810                         new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
1811                         ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
1812                         ifp->if_broot_bytes = (int)new_size;
1813                         return;
1814                 }
1815
1816                 /*
1817                  * If there is already an existing if_broot, then we need
1818                  * to realloc() it and shift the pointers to their new
1819                  * location.  The records don't change location because
1820                  * they are kept butted up against the btree block header.
1821                  */
1822                 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
1823                 new_max = cur_max + rec_diff;
1824                 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
1825                 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
1826                                 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
1827                                 KM_SLEEP | KM_NOFS);
1828                 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
1829                                                      ifp->if_broot_bytes);
1830                 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
1831                                                      (int)new_size);
1832                 ifp->if_broot_bytes = (int)new_size;
1833                 ASSERT(ifp->if_broot_bytes <=
1834                         XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
1835                 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
1836                 return;
1837         }
1838
1839         /*
1840          * rec_diff is less than 0.  In this case, we are shrinking the
1841          * if_broot buffer.  It must already exist.  If we go to zero
1842          * records, just get rid of the root and clear the status bit.
1843          */
1844         ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
1845         cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
1846         new_max = cur_max + rec_diff;
1847         ASSERT(new_max >= 0);
1848         if (new_max > 0)
1849                 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
1850         else
1851                 new_size = 0;
1852         if (new_size > 0) {
1853                 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
1854                 /*
1855                  * First copy over the btree block header.
1856                  */
1857                 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
1858         } else {
1859                 new_broot = NULL;
1860                 ifp->if_flags &= ~XFS_IFBROOT;
1861         }
1862
1863         /*
1864          * Only copy the records and pointers if there are any.
1865          */
1866         if (new_max > 0) {
1867                 /*
1868                  * First copy the records.
1869                  */
1870                 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
1871                 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
1872                 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
1873
1874                 /*
1875                  * Then copy the pointers.
1876                  */
1877                 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
1878                                                      ifp->if_broot_bytes);
1879                 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
1880                                                      (int)new_size);
1881                 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
1882         }
1883         kmem_free(ifp->if_broot);
1884         ifp->if_broot = new_broot;
1885         ifp->if_broot_bytes = (int)new_size;
1886         ASSERT(ifp->if_broot_bytes <=
1887                 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
1888         return;
1889 }
1890
1891
1892 /*
1893  * This is called when the amount of space needed for if_data
1894  * is increased or decreased.  The change in size is indicated by
1895  * the number of bytes that need to be added or deleted in the
1896  * byte_diff parameter.
1897  *
1898  * If the amount of space needed has decreased below the size of the
1899  * inline buffer, then switch to using the inline buffer.  Otherwise,
1900  * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
1901  * to what is needed.
1902  *
1903  * ip -- the inode whose if_data area is changing
1904  * byte_diff -- the change in the number of bytes, positive or negative,
1905  *       requested for the if_data array.
1906  */
1907 void
1908 xfs_idata_realloc(
1909         xfs_inode_t     *ip,
1910         int             byte_diff,
1911         int             whichfork)
1912 {
1913         xfs_ifork_t     *ifp;
1914         int             new_size;
1915         int             real_size;
1916
1917         if (byte_diff == 0) {
1918                 return;
1919         }
1920
1921         ifp = XFS_IFORK_PTR(ip, whichfork);
1922         new_size = (int)ifp->if_bytes + byte_diff;
1923         ASSERT(new_size >= 0);
1924
1925         if (new_size == 0) {
1926                 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
1927                         kmem_free(ifp->if_u1.if_data);
1928                 }
1929                 ifp->if_u1.if_data = NULL;
1930                 real_size = 0;
1931         } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
1932                 /*
1933                  * If the valid extents/data can fit in if_inline_ext/data,
1934                  * copy them from the malloc'd vector and free it.
1935                  */
1936                 if (ifp->if_u1.if_data == NULL) {
1937                         ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
1938                 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
1939                         ASSERT(ifp->if_real_bytes != 0);
1940                         memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
1941                               new_size);
1942                         kmem_free(ifp->if_u1.if_data);
1943                         ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
1944                 }
1945                 real_size = 0;
1946         } else {
1947                 /*
1948                  * Stuck with malloc/realloc.
1949                  * For inline data, the underlying buffer must be
1950                  * a multiple of 4 bytes in size so that it can be
1951                  * logged and stay on word boundaries.  We enforce
1952                  * that here.
1953                  */
1954                 real_size = roundup(new_size, 4);
1955                 if (ifp->if_u1.if_data == NULL) {
1956                         ASSERT(ifp->if_real_bytes == 0);
1957                         ifp->if_u1.if_data = kmem_alloc(real_size,
1958                                                         KM_SLEEP | KM_NOFS);
1959                 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
1960                         /*
1961                          * Only do the realloc if the underlying size
1962                          * is really changing.
1963                          */
1964                         if (ifp->if_real_bytes != real_size) {
1965                                 ifp->if_u1.if_data =
1966                                         kmem_realloc(ifp->if_u1.if_data,
1967                                                         real_size,
1968                                                         ifp->if_real_bytes,
1969                                                         KM_SLEEP | KM_NOFS);
1970                         }
1971                 } else {
1972                         ASSERT(ifp->if_real_bytes == 0);
1973                         ifp->if_u1.if_data = kmem_alloc(real_size,
1974                                                         KM_SLEEP | KM_NOFS);
1975                         memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
1976                                 ifp->if_bytes);
1977                 }
1978         }
1979         ifp->if_real_bytes = real_size;
1980         ifp->if_bytes = new_size;
1981         ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
1982 }
1983
1984 void
1985 xfs_idestroy_fork(
1986         xfs_inode_t     *ip,
1987         int             whichfork)
1988 {
1989         xfs_ifork_t     *ifp;
1990
1991         ifp = XFS_IFORK_PTR(ip, whichfork);
1992         if (ifp->if_broot != NULL) {
1993                 kmem_free(ifp->if_broot);
1994                 ifp->if_broot = NULL;
1995         }
1996
1997         /*
1998          * If the format is local, then we can't have an extents
1999          * array so just look for an inline data array.  If we're
2000          * not local then we may or may not have an extents list,
2001          * so check and free it up if we do.
2002          */
2003         if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2004                 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2005                     (ifp->if_u1.if_data != NULL)) {
2006                         ASSERT(ifp->if_real_bytes != 0);
2007                         kmem_free(ifp->if_u1.if_data);
2008                         ifp->if_u1.if_data = NULL;
2009                         ifp->if_real_bytes = 0;
2010                 }
2011         } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2012                    ((ifp->if_flags & XFS_IFEXTIREC) ||
2013                     ((ifp->if_u1.if_extents != NULL) &&
2014                      (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2015                 ASSERT(ifp->if_real_bytes != 0);
2016                 xfs_iext_destroy(ifp);
2017         }
2018         ASSERT(ifp->if_u1.if_extents == NULL ||
2019                ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2020         ASSERT(ifp->if_real_bytes == 0);
2021         if (whichfork == XFS_ATTR_FORK) {
2022                 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2023                 ip->i_afp = NULL;
2024         }
2025 }
2026
2027 /*
2028  * This is called to unpin an inode.  The caller must have the inode locked
2029  * in at least shared mode so that the buffer cannot be subsequently pinned
2030  * once someone is waiting for it to be unpinned.
2031  */
2032 static void
2033 xfs_iunpin(
2034         struct xfs_inode        *ip)
2035 {
2036         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2037
2038         trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2039
2040         /* Give the log a push to start the unpinning I/O */
2041         xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2042
2043 }
2044
2045 static void
2046 __xfs_iunpin_wait(
2047         struct xfs_inode        *ip)
2048 {
2049         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2050         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2051
2052         xfs_iunpin(ip);
2053
2054         do {
2055                 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2056                 if (xfs_ipincount(ip))
2057                         io_schedule();
2058         } while (xfs_ipincount(ip));
2059         finish_wait(wq, &wait.wait);
2060 }
2061
/*
 * Wait for an inode to become unpinned.  Returns immediately when the pin
 * count is already zero; otherwise hands off to __xfs_iunpin_wait().
 */
void
xfs_iunpin_wait(
        struct xfs_inode        *ip)
{
        if (!xfs_ipincount(ip))
                return;

        __xfs_iunpin_wait(ip);
}
2069
2070 /*
2071  * xfs_iextents_copy()
2072  *
2073  * This is called to copy the REAL extents (as opposed to the delayed
2074  * allocation extents) from the inode into the given buffer.  It
2075  * returns the number of bytes copied into the buffer.
2076  *
2077  * If there are no delayed allocation extents, then we can just
2078  * memcpy() the extents into the buffer.  Otherwise, we need to
2079  * examine each extent in turn and skip those which are delayed.
2080  */
2081 int
2082 xfs_iextents_copy(
2083         xfs_inode_t             *ip,
2084         xfs_bmbt_rec_t          *dp,
2085         int                     whichfork)
2086 {
2087         int                     copied;
2088         int                     i;
2089         xfs_ifork_t             *ifp;
2090         int                     nrecs;
2091         xfs_fsblock_t           start_block;
2092
2093         ifp = XFS_IFORK_PTR(ip, whichfork);
2094         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2095         ASSERT(ifp->if_bytes > 0);
2096
2097         nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2098         XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2099         ASSERT(nrecs > 0);
2100
2101         /*
2102          * There are some delayed allocation extents in the
2103          * inode, so copy the extents one at a time and skip
2104          * the delayed ones.  There must be at least one
2105          * non-delayed extent.
2106          */
2107         copied = 0;
2108         for (i = 0; i < nrecs; i++) {
2109                 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2110                 start_block = xfs_bmbt_get_startblock(ep);
2111                 if (isnullstartblock(start_block)) {
2112                         /*
2113                          * It's a delayed allocation extent, so skip it.
2114                          */
2115                         continue;
2116                 }
2117
2118                 /* Translate to on disk format */
2119                 put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2120                 put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2121                 dp++;
2122                 copied++;
2123         }
2124         ASSERT(copied != 0);
2125         xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2126
2127         return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2128 }
2129
2130 /*
2131  * Each of the following cases stores data into the same region
2132  * of the on-disk inode, so only one of them can be valid at
2133  * any given time. While it is possible to have conflicting formats
2134  * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2135  * in EXTENTS format, this can only happen when the fork has
2136  * changed formats after being modified but before being flushed.
2137  * In these cases, the format always takes precedence, because the
2138  * format indicates the current state of the fork.
2139  */
2140 /*ARGSUSED*/
2141 STATIC void
2142 xfs_iflush_fork(
2143         xfs_inode_t             *ip,
2144         xfs_dinode_t            *dip,
2145         xfs_inode_log_item_t    *iip,
2146         int                     whichfork,
2147         xfs_buf_t               *bp)
2148 {
2149         char                    *cp;
2150         xfs_ifork_t             *ifp;
2151         xfs_mount_t             *mp;
2152 #ifdef XFS_TRANS_DEBUG
2153         int                     first;
2154 #endif
2155         static const short      brootflag[2] =
2156                 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2157         static const short      dataflag[2] =
2158                 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2159         static const short      extflag[2] =
2160                 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2161
2162         if (!iip)
2163                 return;
2164         ifp = XFS_IFORK_PTR(ip, whichfork);
2165         /*
2166          * This can happen if we gave up in iformat in an error path,
2167          * for the attribute fork.
2168          */
2169         if (!ifp) {
2170                 ASSERT(whichfork == XFS_ATTR_FORK);
2171                 return;
2172         }
2173         cp = XFS_DFORK_PTR(dip, whichfork);
2174         mp = ip->i_mount;
2175         switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2176         case XFS_DINODE_FMT_LOCAL:
2177                 if ((iip->ili_fields & dataflag[whichfork]) &&
2178                     (ifp->if_bytes > 0)) {
2179                         ASSERT(ifp->if_u1.if_data != NULL);
2180                         ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2181                         memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2182                 }
2183                 break;
2184
2185         case XFS_DINODE_FMT_EXTENTS:
2186                 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2187                        !(iip->ili_fields & extflag[whichfork]));
2188                 if ((iip->ili_fields & extflag[whichfork]) &&
2189                     (ifp->if_bytes > 0)) {
2190                         ASSERT(xfs_iext_get_ext(ifp, 0));
2191                         ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2192                         (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2193                                 whichfork);
2194                 }
2195                 break;
2196
2197         case XFS_DINODE_FMT_BTREE:
2198                 if ((iip->ili_fields & brootflag[whichfork]) &&
2199                     (ifp->if_broot_bytes > 0)) {
2200                         ASSERT(ifp->if_broot != NULL);
2201                         ASSERT(ifp->if_broot_bytes <=
2202                                (XFS_IFORK_SIZE(ip, whichfork) +
2203                                 XFS_BROOT_SIZE_ADJ));
2204                         xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2205                                 (xfs_bmdr_block_t *)cp,
2206                                 XFS_DFORK_SIZE(dip, mp, whichfork));
2207                 }
2208                 break;
2209
2210         case XFS_DINODE_FMT_DEV:
2211                 if (iip->ili_fields & XFS_ILOG_DEV) {
2212                         ASSERT(whichfork == XFS_DATA_FORK);
2213                         xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2214                 }
2215                 break;
2216
2217         case XFS_DINODE_FMT_UUID:
2218                 if (iip->ili_fields & XFS_ILOG_UUID) {
2219                         ASSERT(whichfork == XFS_DATA_FORK);
2220                         memcpy(XFS_DFORK_DPTR(dip),
2221                                &ip->i_df.if_u2.if_uuid,
2222                                sizeof(uuid_t));
2223                 }
2224                 break;
2225
2226         default:
2227                 ASSERT(0);
2228                 break;
2229         }
2230 }
2231
2232 STATIC int
2233 xfs_iflush_cluster(
2234         xfs_inode_t     *ip,
2235         xfs_buf_t       *bp)
2236 {
2237         xfs_mount_t             *mp = ip->i_mount;
2238         struct xfs_perag        *pag;
2239         unsigned long           first_index, mask;
2240         unsigned long           inodes_per_cluster;
2241         int                     ilist_size;
2242         xfs_inode_t             **ilist;
2243         xfs_inode_t             *iq;
2244         int                     nr_found;
2245         int                     clcount = 0;
2246         int                     bufwasdelwri;
2247         int                     i;
2248
2249         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2250
2251         inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2252         ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2253         ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2254         if (!ilist)
2255                 goto out_put;
2256
2257         mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2258         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2259         rcu_read_lock();
2260         /* really need a gang lookup range call here */
2261         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2262                                         first_index, inodes_per_cluster);
2263         if (nr_found == 0)
2264                 goto out_free;
2265
2266         for (i = 0; i < nr_found; i++) {
2267                 iq = ilist[i];
2268                 if (iq == ip)
2269                         continue;
2270
2271                 /*
2272                  * because this is an RCU protected lookup, we could find a
2273                  * recently freed or even reallocated inode during the lookup.
2274                  * We need to check under the i_flags_lock for a valid inode
2275                  * here. Skip it if it is not valid or the wrong inode.
2276                  */
2277                 spin_lock(&ip->i_flags_lock);
2278                 if (!ip->i_ino ||
2279                     (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2280                         spin_unlock(&ip->i_flags_lock);
2281                         continue;
2282                 }
2283                 spin_unlock(&ip->i_flags_lock);
2284
2285                 /*
2286                  * Do an un-protected check to see if the inode is dirty and
2287                  * is a candidate for flushing.  These checks will be repeated
2288                  * later after the appropriate locks are acquired.
2289                  */
2290                 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2291                         continue;
2292
2293                 /*
2294                  * Try to get locks.  If any are unavailable or it is pinned,
2295                  * then this inode cannot be flushed and is skipped.
2296                  */
2297
2298                 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2299                         continue;
2300                 if (!xfs_iflock_nowait(iq)) {
2301                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
2302                         continue;
2303                 }
2304                 if (xfs_ipincount(iq)) {
2305                         xfs_ifunlock(iq);
2306                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
2307                         continue;
2308                 }
2309
2310                 /*
2311                  * arriving here means that this inode can be flushed.  First
2312                  * re-check that it's dirty before flushing.
2313                  */
2314                 if (!xfs_inode_clean(iq)) {
2315                         int     error;
2316                         error = xfs_iflush_int(iq, bp);
2317                         if (error) {
2318                                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2319                                 goto cluster_corrupt_out;
2320                         }
2321                         clcount++;
2322                 } else {
2323                         xfs_ifunlock(iq);
2324                 }
2325                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2326         }
2327
2328         if (clcount) {
2329                 XFS_STATS_INC(xs_icluster_flushcnt);
2330                 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2331         }
2332
2333 out_free:
2334         rcu_read_unlock();
2335         kmem_free(ilist);
2336 out_put:
2337         xfs_perag_put(pag);
2338         return 0;
2339
2340
2341 cluster_corrupt_out:
2342         /*
2343          * Corruption detected in the clustering loop.  Invalidate the
2344          * inode buffer and shut down the filesystem.
2345          */
2346         rcu_read_unlock();
2347         /*
2348          * Clean up the buffer.  If it was delwri, just release it --
2349          * brelse can handle it with no problems.  If not, shut down the
2350          * filesystem before releasing the buffer.
2351          */
2352         bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
2353         if (bufwasdelwri)
2354                 xfs_buf_relse(bp);
2355
2356         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2357
2358         if (!bufwasdelwri) {
2359                 /*
2360                  * Just like incore_relse: if we have b_iodone functions,
2361                  * mark the buffer as an error and call them.  Otherwise
2362                  * mark it as stale and brelse.
2363                  */
2364                 if (bp->b_iodone) {
2365                         XFS_BUF_UNDONE(bp);
2366                         xfs_buf_stale(bp);
2367                         xfs_buf_ioerror(bp, EIO);
2368                         xfs_buf_ioend(bp, 0);
2369                 } else {
2370                         xfs_buf_stale(bp);
2371                         xfs_buf_relse(bp);
2372                 }
2373         }
2374
2375         /*
2376          * Unlocks the flush lock
2377          */
2378         xfs_iflush_abort(iq, false);
2379         kmem_free(ilist);
2380         xfs_perag_put(pag);
2381         return XFS_ERROR(EFSCORRUPTED);
2382 }
2383
2384 /*
2385  * Flush dirty inode metadata into the backing buffer.
2386  *
2387  * The caller must have the inode lock and the inode flush lock held.  The
2388  * inode lock will still be held upon return to the caller, and the inode
2389  * flush lock will be released after the inode has reached the disk.
2390  *
2391  * The caller must write out the buffer returned in *bpp and release it.
2392  */
2393 int
2394 xfs_iflush(
2395         struct xfs_inode        *ip,
2396         struct xfs_buf          **bpp)
2397 {
2398         struct xfs_mount        *mp = ip->i_mount;
2399         struct xfs_buf          *bp;
2400         struct xfs_dinode       *dip;
2401         int                     error;
2402
2403         XFS_STATS_INC(xs_iflush_count);
2404
2405         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2406         ASSERT(xfs_isiflocked(ip));
2407         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2408                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2409
2410         *bpp = NULL;
2411
2412         xfs_iunpin_wait(ip);
2413
2414         /*
2415          * For stale inodes we cannot rely on the backing buffer remaining
2416          * stale in cache for the remaining life of the stale inode and so
2417          * xfs_itobp() below may give us a buffer that no longer contains
2418          * inodes below. We have to check this after ensuring the inode is
2419          * unpinned so that it is safe to reclaim the stale inode after the
2420          * flush call.
2421          */
2422         if (xfs_iflags_test(ip, XFS_ISTALE)) {
2423                 xfs_ifunlock(ip);
2424                 return 0;
2425         }
2426
2427         /*
2428          * This may have been unpinned because the filesystem is shutting
2429          * down forcibly. If that's the case we must not write this inode
2430          * to disk, because the log record didn't make it to disk.
2431          *
2432          * We also have to remove the log item from the AIL in this case,
2433          * as we wait for an empty AIL as part of the unmount process.
2434          */
2435         if (XFS_FORCED_SHUTDOWN(mp)) {
2436                 error = XFS_ERROR(EIO);
2437                 goto abort_out;
2438         }
2439
2440         /*
2441          * Get the buffer containing the on-disk inode.
2442          */
2443         error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK);
2444         if (error || !bp) {
2445                 xfs_ifunlock(ip);
2446                 return error;
2447         }
2448
2449         /*
2450          * First flush out the inode that xfs_iflush was called with.
2451          */
2452         error = xfs_iflush_int(ip, bp);
2453         if (error)
2454                 goto corrupt_out;
2455
2456         /*
2457          * If the buffer is pinned then push on the log now so we won't
2458          * get stuck waiting in the write for too long.
2459          */
2460         if (xfs_buf_ispinned(bp))
2461                 xfs_log_force(mp, 0);
2462
2463         /*
2464          * inode clustering:
2465          * see if other inodes can be gathered into this write
2466          */
2467         error = xfs_iflush_cluster(ip, bp);
2468         if (error)
2469                 goto cluster_corrupt_out;
2470
2471         *bpp = bp;
2472         return 0;
2473
2474 corrupt_out:
2475         xfs_buf_relse(bp);
2476         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2477 cluster_corrupt_out:
2478         error = XFS_ERROR(EFSCORRUPTED);
2479 abort_out:
2480         /*
2481          * Unlocks the flush lock
2482          */
2483         xfs_iflush_abort(ip, false);
2484         return error;
2485 }
2486
2487
2488 STATIC int
2489 xfs_iflush_int(
2490         xfs_inode_t             *ip,
2491         xfs_buf_t               *bp)
2492 {
2493         xfs_inode_log_item_t    *iip;
2494         xfs_dinode_t            *dip;
2495         xfs_mount_t             *mp;
2496 #ifdef XFS_TRANS_DEBUG
2497         int                     first;
2498 #endif
2499
2500         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2501         ASSERT(xfs_isiflocked(ip));
2502         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2503                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2504
2505         iip = ip->i_itemp;
2506         mp = ip->i_mount;
2507
2508         /* set *dip = inode's place in the buffer */
2509         dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
2510
2511         if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
2512                                mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2513                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2514                         "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2515                         __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2516                 goto corrupt_out;
2517         }
2518         if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2519                                 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2520                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2521                         "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2522                         __func__, ip->i_ino, ip, ip->i_d.di_magic);
2523                 goto corrupt_out;
2524         }
2525         if (S_ISREG(ip->i_d.di_mode)) {
2526                 if (XFS_TEST_ERROR(
2527                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2528                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2529                     mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2530                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2531                                 "%s: Bad regular inode %Lu, ptr 0x%p",
2532                                 __func__, ip->i_ino, ip);
2533                         goto corrupt_out;
2534                 }
2535         } else if (S_ISDIR(ip->i_d.di_mode)) {
2536                 if (XFS_TEST_ERROR(
2537                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2538                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2539                     (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2540                     mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2541                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2542                                 "%s: Bad directory inode %Lu, ptr 0x%p",
2543                                 __func__, ip->i_ino, ip);
2544                         goto corrupt_out;
2545                 }
2546         }
2547         if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2548                                 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2549                                 XFS_RANDOM_IFLUSH_5)) {
2550                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2551                         "%s: detected corrupt incore inode %Lu, "
2552                         "total extents = %d, nblocks = %Ld, ptr 0x%p",
2553                         __func__, ip->i_ino,
2554                         ip->i_d.di_nextents + ip->i_d.di_anextents,
2555                         ip->i_d.di_nblocks, ip);
2556                 goto corrupt_out;
2557         }
2558         if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2559                                 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2560                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2561                         "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2562                         __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2563                 goto corrupt_out;
2564         }
2565         /*
2566          * bump the flush iteration count, used to detect flushes which
2567          * postdate a log record during recovery.
2568          */
2569
2570         ip->i_d.di_flushiter++;
2571
2572         /*
2573          * Copy the dirty parts of the inode into the on-disk
2574          * inode.  We always copy out the core of the inode,
2575          * because if the inode is dirty at all the core must
2576          * be.
2577          */
2578         xfs_dinode_to_disk(dip, &ip->i_d);
2579
2580         /* Wrap, we never let the log put out DI_MAX_FLUSH */
2581         if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
2582                 ip->i_d.di_flushiter = 0;
2583
2584         /*
2585          * If this is really an old format inode and the superblock version
2586          * has not been updated to support only new format inodes, then
2587          * convert back to the old inode format.  If the superblock version
2588          * has been updated, then make the conversion permanent.
2589          */
2590         ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
2591         if (ip->i_d.di_version == 1) {
2592                 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
2593                         /*
2594                          * Convert it back.
2595                          */
2596                         ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
2597                         dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
2598                 } else {
2599                         /*
2600                          * The superblock version has already been bumped,
2601                          * so just make the conversion to the new inode
2602                          * format permanent.
2603                          */
2604                         ip->i_d.di_version = 2;
2605                         dip->di_version = 2;
2606                         ip->i_d.di_onlink = 0;
2607                         dip->di_onlink = 0;
2608                         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
2609                         memset(&(dip->di_pad[0]), 0,
2610                               sizeof(dip->di_pad));
2611                         ASSERT(xfs_get_projid(ip) == 0);
2612                 }
2613         }
2614
2615         xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
2616         if (XFS_IFORK_Q(ip))
2617                 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
2618         xfs_inobp_check(mp, bp);
2619
2620         /*
2621          * We've recorded everything logged in the inode, so we'd like to clear
2622          * the ili_fields bits so we don't log and flush things unnecessarily.
2623          * However, we can't stop logging all this information until the data
2624          * we've copied into the disk buffer is written to disk.  If we did we
2625          * might overwrite the copy of the inode in the log with all the data
2626          * after re-logging only part of it, and in the face of a crash we
2627          * wouldn't have all the data we need to recover.
2628          *
2629          * What we do is move the bits to the ili_last_fields field.  When
2630          * logging the inode, these bits are moved back to the ili_fields field.
2631          * In the xfs_iflush_done() routine we clear ili_last_fields, since we
2632          * know that the information those bits represent is permanently on
2633          * disk.  As long as the flush completes before the inode is logged
2634          * again, then both ili_fields and ili_last_fields will be cleared.
2635          *
2636          * We can play with the ili_fields bits here, because the inode lock
2637          * must be held exclusively in order to set bits there and the flush
2638          * lock protects the ili_last_fields bits.  Set ili_logged so the flush
2639          * done routine can tell whether or not to look in the AIL.  Also, store
2640          * the current LSN of the inode so that we can tell whether the item has
2641          * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
2642          * need the AIL lock, because it is a 64 bit value that cannot be read
2643          * atomically.
2644          */
2645         if (iip != NULL && iip->ili_fields != 0) {
2646                 iip->ili_last_fields = iip->ili_fields;
2647                 iip->ili_fields = 0;
2648                 iip->ili_logged = 1;
2649
2650                 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2651                                         &iip->ili_item.li_lsn);
2652
2653                 /*
2654                  * Attach the function xfs_iflush_done to the inode's
2655                  * buffer.  This will remove the inode from the AIL
2656                  * and unlock the inode's flush lock when the inode is
2657                  * completely written to disk.
2658                  */
2659                 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
2660
2661                 ASSERT(bp->b_fspriv != NULL);
2662                 ASSERT(bp->b_iodone != NULL);
2663         } else {
2664                 /*
2665                  * We're flushing an inode which is not in the AIL and has
2666                  * not been logged.  For this case we can immediately drop
2667                  * the inode flush lock because we can avoid the whole
2668                  * AIL state thing.  It's OK to drop the flush lock now,
2669                  * because we've already locked the buffer and to do anything
2670                  * you really need both.
2671                  */
2672                 if (iip != NULL) {
2673                         ASSERT(iip->ili_logged == 0);
2674                         ASSERT(iip->ili_last_fields == 0);
2675                         ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
2676                 }
2677                 xfs_ifunlock(ip);
2678         }
2679
2680         return 0;
2681
2682 corrupt_out:
2683         return XFS_ERROR(EFSCORRUPTED);
2684 }
2685
2686 /*
2687  * Return a pointer to the extent record at file index idx.
2688  */
2689 xfs_bmbt_rec_host_t *
2690 xfs_iext_get_ext(
2691         xfs_ifork_t     *ifp,           /* inode fork pointer */
2692         xfs_extnum_t    idx)            /* index of target extent */
2693 {
2694         ASSERT(idx >= 0);
2695         ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
2696
2697         if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
2698                 return ifp->if_u1.if_ext_irec->er_extbuf;
2699         } else if (ifp->if_flags & XFS_IFEXTIREC) {
2700                 xfs_ext_irec_t  *erp;           /* irec pointer */
2701                 int             erp_idx = 0;    /* irec index */
2702                 xfs_extnum_t    page_idx = idx; /* ext index in target list */
2703
2704                 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
2705                 return &erp->er_extbuf[page_idx];
2706         } else if (ifp->if_bytes) {
2707                 return &ifp->if_u1.if_extents[idx];
2708         } else {
2709                 return NULL;
2710         }
2711 }
2712
2713 /*
2714  * Insert new item(s) into the extent records for incore inode
2715  * fork 'ifp'.  'count' new items are inserted at index 'idx'.
2716  */
2717 void
2718 xfs_iext_insert(
2719         xfs_inode_t     *ip,            /* incore inode pointer */
2720         xfs_extnum_t    idx,            /* starting index of new items */
2721         xfs_extnum_t    count,          /* number of inserted items */
2722         xfs_bmbt_irec_t *new,           /* items to insert */
2723         int             state)          /* type of extent conversion */
2724 {
2725         xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
2726         xfs_extnum_t    i;              /* extent record index */
2727
2728         trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
2729
2730         ASSERT(ifp->if_flags & XFS_IFEXTENTS);
2731         xfs_iext_add(ifp, idx, count);
2732         for (i = idx; i < idx + count; i++, new++)
2733                 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
2734 }
2735
2736 /*
2737  * This is called when the amount of space required for incore file
2738  * extents needs to be increased. The ext_diff parameter stores the
2739  * number of new extents being added and the idx parameter contains
2740  * the extent index where the new extents will be added. If the new
2741  * extents are being appended, then we just need to (re)allocate and
2742  * initialize the space. Otherwise, if the new extents are being
2743  * inserted into the middle of the existing entries, a bit more work
2744  * is required to make room for the new extents to be inserted. The
2745  * caller is responsible for filling in the new extent entries upon
2746  * return.
2747  */
2748 void
2749 xfs_iext_add(
2750         xfs_ifork_t     *ifp,           /* inode fork pointer */
2751         xfs_extnum_t    idx,            /* index to begin adding exts */
2752         int             ext_diff)       /* number of extents to add */
2753 {
2754         int             byte_diff;      /* new bytes being added */
2755         int             new_size;       /* size of extents after adding */
2756         xfs_extnum_t    nextents;       /* number of extents in file */
2757
2758         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2759         ASSERT((idx >= 0) && (idx <= nextents));
2760         byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
2761         new_size = ifp->if_bytes + byte_diff;
2762         /*
2763          * If the new number of extents (nextents + ext_diff)
2764          * fits inside the inode, then continue to use the inline
2765          * extent buffer.
2766          */
2767         if (nextents + ext_diff <= XFS_INLINE_EXTS) {
2768                 if (idx < nextents) {
2769                         memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
2770                                 &ifp->if_u2.if_inline_ext[idx],
2771                                 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
2772                         memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
2773                 }
2774                 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
2775                 ifp->if_real_bytes = 0;
2776         }
2777         /*
2778          * Otherwise use a linear (direct) extent list.
2779          * If the extents are currently inside the inode,
2780          * xfs_iext_realloc_direct will switch us from
2781          * inline to direct extent allocation mode.
2782          */
2783         else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
2784                 xfs_iext_realloc_direct(ifp, new_size);
2785                 if (idx < nextents) {
2786                         memmove(&ifp->if_u1.if_extents[idx + ext_diff],
2787                                 &ifp->if_u1.if_extents[idx],
2788                                 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
2789                         memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
2790                 }
2791         }
2792         /* Indirection array */
2793         else {
2794                 xfs_ext_irec_t  *erp;
2795                 int             erp_idx = 0;
2796                 int             page_idx = idx;
2797
2798                 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
2799                 if (ifp->if_flags & XFS_IFEXTIREC) {
2800                         erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
2801                 } else {
2802                         xfs_iext_irec_init(ifp);
2803                         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
2804                         erp = ifp->if_u1.if_ext_irec;
2805                 }
2806                 /* Extents fit in target extent page */
2807                 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
2808                         if (page_idx < erp->er_extcount) {
2809                                 memmove(&erp->er_extbuf[page_idx + ext_diff],
2810                                         &erp->er_extbuf[page_idx],
2811                                         (erp->er_extcount - page_idx) *
2812                                         sizeof(xfs_bmbt_rec_t));
2813                                 memset(&erp->er_extbuf[page_idx], 0, byte_diff);
2814                         }
2815                         erp->er_extcount += ext_diff;
2816                         xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
2817                 }
2818                 /* Insert a new extent page */
2819                 else if (erp) {
2820                         xfs_iext_add_indirect_multi(ifp,
2821                                 erp_idx, page_idx, ext_diff);
2822                 }
2823                 /*
2824                  * If extent(s) are being appended to the last page in
2825                  * the indirection array and the new extent(s) don't fit
2826                  * in the page, then erp is NULL and erp_idx is set to
2827                  * the next index needed in the indirection array.
2828                  */
2829                 else {
2830                         int     count = ext_diff;
2831
2832                         while (count) {
2833                                 erp = xfs_iext_irec_new(ifp, erp_idx);
2834                                 erp->er_extcount = count;
2835                                 count -= MIN(count, (int)XFS_LINEAR_EXTS);
2836                                 if (count) {
2837                                         erp_idx++;
2838                                 }
2839                         }
2840                 }
2841         }
2842         ifp->if_bytes = new_size;
2843 }
2844
2845 /*
2846  * This is called when incore extents are being added to the indirection
2847  * array and the new extents do not fit in the target extent list. The
2848  * erp_idx parameter contains the irec index for the target extent list
2849  * in the indirection array, and the idx parameter contains the extent
2850  * index within the list. The number of extents being added is stored
2851  * in the count parameter.
2852  *
2853  *    |-------|   |-------|
2854  *    |       |   |       |    idx - number of extents before idx
2855  *    |  idx  |   | count |
2856  *    |       |   |       |    count - number of extents being inserted at idx
2857  *    |-------|   |-------|
2858  *    | count |   | nex2  |    nex2 - number of extents after idx + count
2859  *    |-------|   |-------|
2860  */
2861 void
2862 xfs_iext_add_indirect_multi(
2863         xfs_ifork_t     *ifp,                   /* inode fork pointer */
2864         int             erp_idx,                /* target extent irec index */
2865         xfs_extnum_t    idx,                    /* index within target list */
2866         int             count)                  /* new extents being added */
2867 {
2868         int             byte_diff;              /* new bytes being added */
2869         xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
2870         xfs_extnum_t    ext_diff;               /* number of extents to add */
2871         xfs_extnum_t    ext_cnt;                /* new extents still needed */
2872         xfs_extnum_t    nex2;                   /* extents after idx + count */
2873         xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
2874         int             nlists;                 /* number of irec's (lists) */
2875
2876         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
2877         erp = &ifp->if_u1.if_ext_irec[erp_idx];
2878         nex2 = erp->er_extcount - idx;
2879         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
2880
        /*
         * Save second part of target extent list
         * (all extents from idx onward).
         */
2884         if (nex2) {
2885                 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
2886                 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
2887                 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
2888                 erp->er_extcount -= nex2;
2889                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
2890                 memset(&erp->er_extbuf[idx], 0, byte_diff);
2891         }
2892
2893         /*
2894          * Add the new extents to the end of the target
2895          * list, then allocate new irec record(s) and
2896          * extent buffer(s) as needed to store the rest
2897          * of the new extents.
2898          */
2899         ext_cnt = count;
2900         ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
2901         if (ext_diff) {
2902                 erp->er_extcount += ext_diff;
2903                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
2904                 ext_cnt -= ext_diff;
2905         }
2906         while (ext_cnt) {
2907                 erp_idx++;
2908                 erp = xfs_iext_irec_new(ifp, erp_idx);
2909                 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
2910                 erp->er_extcount = ext_diff;
2911                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
2912                 ext_cnt -= ext_diff;
2913         }
2914
2915         /* Add nex2 extents back to indirection array */
2916         if (nex2) {
2917                 xfs_extnum_t    ext_avail;
2918                 int             i;
2919
2920                 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
2921                 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
2922                 i = 0;
2923                 /*
2924                  * If nex2 extents fit in the current page, append
2925                  * nex2_ep after the new extents.
2926                  */
2927                 if (nex2 <= ext_avail) {
2928                         i = erp->er_extcount;
2929                 }
2930                 /*
2931                  * Otherwise, check if space is available in the
2932                  * next page.
2933                  */
2934                 else if ((erp_idx < nlists - 1) &&
2935                          (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
2936                           ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
2937                         erp_idx++;
2938                         erp++;
2939                         /* Create a hole for nex2 extents */
2940                         memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
2941                                 erp->er_extcount * sizeof(xfs_bmbt_rec_t));
2942                 }
2943                 /*
2944                  * Final choice, create a new extent page for
2945                  * nex2 extents.
2946                  */
2947                 else {
2948                         erp_idx++;
2949                         erp = xfs_iext_irec_new(ifp, erp_idx);
2950                 }
2951                 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
2952                 kmem_free(nex2_ep);
2953                 erp->er_extcount += nex2;
2954                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
2955         }
2956 }
2957
2958 /*
2959  * This is called when the amount of space required for incore file
2960  * extents needs to be decreased. The ext_diff parameter stores the
2961  * number of extents to be removed and the idx parameter contains
2962  * the extent index where the extents will be removed from.
2963  *
2964  * If the amount of space needed has decreased below the linear
2965  * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
2966  * extent array.  Otherwise, use kmem_realloc() to adjust the
2967  * size to what is needed.
2968  */
2969 void
2970 xfs_iext_remove(
2971         xfs_inode_t     *ip,            /* incore inode pointer */
2972         xfs_extnum_t    idx,            /* index to begin removing exts */
2973         int             ext_diff,       /* number of extents to remove */
2974         int             state)          /* type of extent conversion */
2975 {
2976         xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
2977         xfs_extnum_t    nextents;       /* number of extents in file */
2978         int             new_size;       /* size of extents after removal */
2979
2980         trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
2981
2982         ASSERT(ext_diff > 0);
2983         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2984         new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
2985
2986         if (new_size == 0) {
2987                 xfs_iext_destroy(ifp);
2988         } else if (ifp->if_flags & XFS_IFEXTIREC) {
2989                 xfs_iext_remove_indirect(ifp, idx, ext_diff);
2990         } else if (ifp->if_real_bytes) {
2991                 xfs_iext_remove_direct(ifp, idx, ext_diff);
2992         } else {
2993                 xfs_iext_remove_inline(ifp, idx, ext_diff);
2994         }
2995         ifp->if_bytes = new_size;
2996 }
2997
2998 /*
2999  * This removes ext_diff extents from the inline buffer, beginning
3000  * at extent index idx.
3001  */
3002 void
3003 xfs_iext_remove_inline(
3004         xfs_ifork_t     *ifp,           /* inode fork pointer */
3005         xfs_extnum_t    idx,            /* index to begin removing exts */
3006         int             ext_diff)       /* number of extents to remove */
3007 {
3008         int             nextents;       /* number of extents in file */
3009
3010         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3011         ASSERT(idx < XFS_INLINE_EXTS);
3012         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3013         ASSERT(((nextents - ext_diff) > 0) &&
3014                 (nextents - ext_diff) < XFS_INLINE_EXTS);
3015
3016         if (idx + ext_diff < nextents) {
3017                 memmove(&ifp->if_u2.if_inline_ext[idx],
3018                         &ifp->if_u2.if_inline_ext[idx + ext_diff],
3019                         (nextents - (idx + ext_diff)) *
3020                          sizeof(xfs_bmbt_rec_t));
3021                 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3022                         0, ext_diff * sizeof(xfs_bmbt_rec_t));
3023         } else {
3024                 memset(&ifp->if_u2.if_inline_ext[idx], 0,
3025                         ext_diff * sizeof(xfs_bmbt_rec_t));
3026         }
3027 }
3028
3029 /*
3030  * This removes ext_diff extents from a linear (direct) extent list,
3031  * beginning at extent index idx. If the extents are being removed
3032  * from the end of the list (ie. truncate) then we just need to re-
3033  * allocate the list to remove the extra space. Otherwise, if the
3034  * extents are being removed from the middle of the existing extent
3035  * entries, then we first need to move the extent records beginning
3036  * at idx + ext_diff up in the list to overwrite the records being
3037  * removed, then remove the extra space via kmem_realloc.
3038  */
3039 void
3040 xfs_iext_remove_direct(
3041         xfs_ifork_t     *ifp,           /* inode fork pointer */
3042         xfs_extnum_t    idx,            /* index to begin removing exts */
3043         int             ext_diff)       /* number of extents to remove */
3044 {
3045         xfs_extnum_t    nextents;       /* number of extents in file */
3046         int             new_size;       /* size of extents after removal */
3047
3048         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3049         new_size = ifp->if_bytes -
3050                 (ext_diff * sizeof(xfs_bmbt_rec_t));
3051         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3052
3053         if (new_size == 0) {
3054                 xfs_iext_destroy(ifp);
3055                 return;
3056         }
3057         /* Move extents up in the list (if needed) */
3058         if (idx + ext_diff < nextents) {
3059                 memmove(&ifp->if_u1.if_extents[idx],
3060                         &ifp->if_u1.if_extents[idx + ext_diff],
3061                         (nextents - (idx + ext_diff)) *
3062                          sizeof(xfs_bmbt_rec_t));
3063         }
3064         memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3065                 0, ext_diff * sizeof(xfs_bmbt_rec_t));
3066         /*
3067          * Reallocate the direct extent list. If the extents
3068          * will fit inside the inode then xfs_iext_realloc_direct
3069          * will switch from direct to inline extent allocation
3070          * mode for us.
3071          */
3072         xfs_iext_realloc_direct(ifp, new_size);
3073         ifp->if_bytes = new_size;
3074 }
3075
3076 /*
3077  * This is called when incore extents are being removed from the
3078  * indirection array and the extents being removed span multiple extent
3079  * buffers. The idx parameter contains the file extent index where we
3080  * want to begin removing extents, and the count parameter contains
3081  * how many extents need to be removed.
3082  *
3083  *    |-------|   |-------|
3084  *    | nex1  |   |       |    nex1 - number of extents before idx
3085  *    |-------|   | count |
3086  *    |       |   |       |    count - number of extents being removed at idx
3087  *    | count |   |-------|
3088  *    |       |   | nex2  |    nex2 - number of extents after idx + count
3089  *    |-------|   |-------|
3090  */
3091 void
3092 xfs_iext_remove_indirect(
3093         xfs_ifork_t     *ifp,           /* inode fork pointer */
3094         xfs_extnum_t    idx,            /* index to begin removing extents */
3095         int             count)          /* number of extents to remove */
3096 {
3097         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3098         int             erp_idx = 0;    /* indirection array index */
3099         xfs_extnum_t    ext_cnt;        /* extents left to remove */
3100         xfs_extnum_t    ext_diff;       /* extents to remove in current list */
3101         xfs_extnum_t    nex1;           /* number of extents before idx */
3102         xfs_extnum_t    nex2;           /* extents after idx + count */
3103         int             page_idx = idx; /* index in target extent list */
3104
3105         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3106         erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
3107         ASSERT(erp != NULL);
3108         nex1 = page_idx;
3109         ext_cnt = count;
3110         while (ext_cnt) {
3111                 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3112                 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3113                 /*
3114                  * Check for deletion of entire list;
3115                  * xfs_iext_irec_remove() updates extent offsets.
3116                  */
3117                 if (ext_diff == erp->er_extcount) {
3118                         xfs_iext_irec_remove(ifp, erp_idx);
3119                         ext_cnt -= ext_diff;
3120                         nex1 = 0;
3121                         if (ext_cnt) {
3122                                 ASSERT(erp_idx < ifp->if_real_bytes /
3123                                         XFS_IEXT_BUFSZ);
3124                                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3125                                 nex1 = 0;
3126                                 continue;
3127                         } else {
3128                                 break;
3129                         }
3130                 }
3131                 /* Move extents up (if needed) */
3132                 if (nex2) {
3133                         memmove(&erp->er_extbuf[nex1],
3134                                 &erp->er_extbuf[nex1 + ext_diff],
3135                                 nex2 * sizeof(xfs_bmbt_rec_t));
3136                 }
3137                 /* Zero out rest of page */
3138                 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3139                         ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3140                 /* Update remaining counters */
3141                 erp->er_extcount -= ext_diff;
3142                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3143                 ext_cnt -= ext_diff;
3144                 nex1 = 0;
3145                 erp_idx++;
3146                 erp++;
3147         }
3148         ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3149         xfs_iext_irec_compact(ifp);
3150 }
3151
3152 /*
3153  * Create, destroy, or resize a linear (direct) block of extents.
3154  */
3155 void
3156 xfs_iext_realloc_direct(
3157         xfs_ifork_t     *ifp,           /* inode fork pointer */
3158         int             new_size)       /* new size of extents */
3159 {
3160         int             rnew_size;      /* real new size of extents */
3161
3162         rnew_size = new_size;
3163
3164         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3165                 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3166                  (new_size != ifp->if_real_bytes)));
3167
3168         /* Free extent records */
3169         if (new_size == 0) {
3170                 xfs_iext_destroy(ifp);
3171         }
3172         /* Resize direct extent list and zero any new bytes */
3173         else if (ifp->if_real_bytes) {
3174                 /* Check if extents will fit inside the inode */
3175                 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3176                         xfs_iext_direct_to_inline(ifp, new_size /
3177                                 (uint)sizeof(xfs_bmbt_rec_t));
3178                         ifp->if_bytes = new_size;
3179                         return;
3180                 }
3181                 if (!is_power_of_2(new_size)){
3182                         rnew_size = roundup_pow_of_two(new_size);
3183                 }
3184                 if (rnew_size != ifp->if_real_bytes) {
3185                         ifp->if_u1.if_extents =
3186                                 kmem_realloc(ifp->if_u1.if_extents,
3187                                                 rnew_size,
3188                                                 ifp->if_real_bytes, KM_NOFS);
3189                 }
3190                 if (rnew_size > ifp->if_real_bytes) {
3191                         memset(&ifp->if_u1.if_extents[ifp->if_bytes /
3192                                 (uint)sizeof(xfs_bmbt_rec_t)], 0,
3193                                 rnew_size - ifp->if_real_bytes);
3194                 }
3195         }
3196         /*
3197          * Switch from the inline extent buffer to a direct
3198          * extent list. Be sure to include the inline extent
3199          * bytes in new_size.
3200          */
3201         else {
3202                 new_size += ifp->if_bytes;
3203                 if (!is_power_of_2(new_size)) {
3204                         rnew_size = roundup_pow_of_two(new_size);
3205                 }
3206                 xfs_iext_inline_to_direct(ifp, rnew_size);
3207         }
3208         ifp->if_real_bytes = rnew_size;
3209         ifp->if_bytes = new_size;
3210 }
3211
3212 /*
3213  * Switch from linear (direct) extent records to inline buffer.
3214  */
3215 void
3216 xfs_iext_direct_to_inline(
3217         xfs_ifork_t     *ifp,           /* inode fork pointer */
3218         xfs_extnum_t    nextents)       /* number of extents in file */
3219 {
3220         ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3221         ASSERT(nextents <= XFS_INLINE_EXTS);
3222         /*
3223          * The inline buffer was zeroed when we switched
3224          * from inline to direct extent allocation mode,
3225          * so we don't need to clear it here.
3226          */
3227         memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3228                 nextents * sizeof(xfs_bmbt_rec_t));
3229         kmem_free(ifp->if_u1.if_extents);
3230         ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3231         ifp->if_real_bytes = 0;
3232 }
3233
3234 /*
3235  * Switch from inline buffer to linear (direct) extent records.
3236  * new_size should already be rounded up to the next power of 2
3237  * by the caller (when appropriate), so use new_size as it is.
3238  * However, since new_size may be rounded up, we can't update
3239  * if_bytes here. It is the caller's responsibility to update
3240  * if_bytes upon return.
3241  */
3242 void
3243 xfs_iext_inline_to_direct(
3244         xfs_ifork_t     *ifp,           /* inode fork pointer */
3245         int             new_size)       /* number of extents in file */
3246 {
3247         ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3248         memset(ifp->if_u1.if_extents, 0, new_size);
3249         if (ifp->if_bytes) {
3250                 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3251                         ifp->if_bytes);
3252                 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3253                         sizeof(xfs_bmbt_rec_t));
3254         }
3255         ifp->if_real_bytes = new_size;
3256 }
3257
3258 /*
3259  * Resize an extent indirection array to new_size bytes.
3260  */
3261 STATIC void
3262 xfs_iext_realloc_indirect(
3263         xfs_ifork_t     *ifp,           /* inode fork pointer */
3264         int             new_size)       /* new indirection array size */
3265 {
3266         int             nlists;         /* number of irec's (ex lists) */
3267         int             size;           /* current indirection array size */
3268
3269         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3270         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3271         size = nlists * sizeof(xfs_ext_irec_t);
3272         ASSERT(ifp->if_real_bytes);
3273         ASSERT((new_size >= 0) && (new_size != size));
3274         if (new_size == 0) {
3275                 xfs_iext_destroy(ifp);
3276         } else {
3277                 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
3278                         kmem_realloc(ifp->if_u1.if_ext_irec,
3279                                 new_size, size, KM_NOFS);
3280         }
3281 }
3282
3283 /*
3284  * Switch from indirection array to linear (direct) extent allocations.
3285  */
3286 STATIC void
3287 xfs_iext_indirect_to_direct(
3288          xfs_ifork_t    *ifp)           /* inode fork pointer */
3289 {
3290         xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
3291         xfs_extnum_t    nextents;       /* number of extents in file */
3292         int             size;           /* size of file extents */
3293
3294         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3295         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3296         ASSERT(nextents <= XFS_LINEAR_EXTS);
3297         size = nextents * sizeof(xfs_bmbt_rec_t);
3298
3299         xfs_iext_irec_compact_pages(ifp);
3300         ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
3301
3302         ep = ifp->if_u1.if_ext_irec->er_extbuf;
3303         kmem_free(ifp->if_u1.if_ext_irec);
3304         ifp->if_flags &= ~XFS_IFEXTIREC;
3305         ifp->if_u1.if_extents = ep;
3306         ifp->if_bytes = size;
3307         if (nextents < XFS_LINEAR_EXTS) {
3308                 xfs_iext_realloc_direct(ifp, size);
3309         }
3310 }
3311
3312 /*
3313  * Free incore file extents.
3314  */
3315 void
3316 xfs_iext_destroy(
3317         xfs_ifork_t     *ifp)           /* inode fork pointer */
3318 {
3319         if (ifp->if_flags & XFS_IFEXTIREC) {
3320                 int     erp_idx;
3321                 int     nlists;
3322
3323                 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3324                 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
3325                         xfs_iext_irec_remove(ifp, erp_idx);
3326                 }
3327                 ifp->if_flags &= ~XFS_IFEXTIREC;
3328         } else if (ifp->if_real_bytes) {
3329                 kmem_free(ifp->if_u1.if_extents);
3330         } else if (ifp->if_bytes) {
3331                 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3332                         sizeof(xfs_bmbt_rec_t));
3333         }
3334         ifp->if_u1.if_extents = NULL;
3335         ifp->if_real_bytes = 0;
3336         ifp->if_bytes = 0;
3337 }
3338
3339 /*
3340  * Return a pointer to the extent record for file system block bno.
3341  */
3342 xfs_bmbt_rec_host_t *                   /* pointer to found extent record */
3343 xfs_iext_bno_to_ext(
3344         xfs_ifork_t     *ifp,           /* inode fork pointer */
3345         xfs_fileoff_t   bno,            /* block number to search for */
3346         xfs_extnum_t    *idxp)          /* index of target extent */
3347 {
3348         xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
3349         xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
3350         xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
3351         xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3352         int             high;           /* upper boundary in search */
3353         xfs_extnum_t    idx = 0;        /* index of target extent */
3354         int             low;            /* lower boundary in search */
3355         xfs_extnum_t    nextents;       /* number of file extents */
3356         xfs_fileoff_t   startoff = 0;   /* start offset of extent */
3357
3358         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3359         if (nextents == 0) {
3360                 *idxp = 0;
3361                 return NULL;
3362         }
3363         low = 0;
3364         if (ifp->if_flags & XFS_IFEXTIREC) {
3365                 /* Find target extent list */
3366                 int     erp_idx = 0;
3367                 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
3368                 base = erp->er_extbuf;
3369                 high = erp->er_extcount - 1;
3370         } else {
3371                 base = ifp->if_u1.if_extents;
3372                 high = nextents - 1;
3373         }
3374         /* Binary search extent records */
3375         while (low <= high) {
3376                 idx = (low + high) >> 1;
3377                 ep = base + idx;
3378                 startoff = xfs_bmbt_get_startoff(ep);
3379                 blockcount = xfs_bmbt_get_blockcount(ep);
3380                 if (bno < startoff) {
3381                         high = idx - 1;
3382                 } else if (bno >= startoff + blockcount) {
3383                         low = idx + 1;
3384                 } else {
3385                         /* Convert back to file-based extent index */
3386                         if (ifp->if_flags & XFS_IFEXTIREC) {
3387                                 idx += erp->er_extoff;
3388                         }
3389                         *idxp = idx;
3390                         return ep;
3391                 }
3392         }
3393         /* Convert back to file-based extent index */
3394         if (ifp->if_flags & XFS_IFEXTIREC) {
3395                 idx += erp->er_extoff;
3396         }
3397         if (bno >= startoff + blockcount) {
3398                 if (++idx == nextents) {
3399                         ep = NULL;
3400                 } else {
3401                         ep = xfs_iext_get_ext(ifp, idx);
3402                 }
3403         }
3404         *idxp = idx;
3405         return ep;
3406 }
3407
3408 /*
3409  * Return a pointer to the indirection array entry containing the
3410  * extent record for filesystem block bno. Store the index of the
3411  * target irec in *erp_idxp.
3412  */
3413 xfs_ext_irec_t *                        /* pointer to found extent record */
3414 xfs_iext_bno_to_irec(
3415         xfs_ifork_t     *ifp,           /* inode fork pointer */
3416         xfs_fileoff_t   bno,            /* block number to search for */
3417         int             *erp_idxp)      /* irec index of target ext list */
3418 {
3419         xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3420         xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
3421         int             erp_idx;        /* indirection array index */
3422         int             nlists;         /* number of extent irec's (lists) */
3423         int             high;           /* binary search upper limit */
3424         int             low;            /* binary search lower limit */
3425
3426         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3427         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3428         erp_idx = 0;
3429         low = 0;
3430         high = nlists - 1;
3431         while (low <= high) {
3432                 erp_idx = (low + high) >> 1;
3433                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3434                 erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
3435                 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
3436                         high = erp_idx - 1;
3437                 } else if (erp_next && bno >=
3438                            xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
3439                         low = erp_idx + 1;
3440                 } else {
3441                         break;
3442                 }
3443         }
3444         *erp_idxp = erp_idx;
3445         return erp;
3446 }
3447
3448 /*
3449  * Return a pointer to the indirection array entry containing the
3450  * extent record at file extent index *idxp. Store the index of the
3451  * target irec in *erp_idxp and store the page index of the target
3452  * extent record in *idxp.
3453  */
3454 xfs_ext_irec_t *
3455 xfs_iext_idx_to_irec(
3456         xfs_ifork_t     *ifp,           /* inode fork pointer */
3457         xfs_extnum_t    *idxp,          /* extent index (file -> page) */
3458         int             *erp_idxp,      /* pointer to target irec */
3459         int             realloc)        /* new bytes were just added */
3460 {
3461         xfs_ext_irec_t  *prev;          /* pointer to previous irec */
3462         xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
3463         int             erp_idx;        /* indirection array index */
3464         int             nlists;         /* number of irec's (ex lists) */
3465         int             high;           /* binary search upper limit */
3466         int             low;            /* binary search lower limit */
3467         xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
3468
3469         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3470         ASSERT(page_idx >= 0);
3471         ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3472         ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3473
3474         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3475         erp_idx = 0;
3476         low = 0;
3477         high = nlists - 1;
3478
3479         /* Binary search extent irec's */
3480         while (low <= high) {
3481                 erp_idx = (low + high) >> 1;
3482                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3483                 prev = erp_idx > 0 ? erp - 1 : NULL;
3484                 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
3485                      realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
3486                         high = erp_idx - 1;
3487                 } else if (page_idx > erp->er_extoff + erp->er_extcount ||
3488                            (page_idx == erp->er_extoff + erp->er_extcount &&
3489                             !realloc)) {
3490                         low = erp_idx + 1;
3491                 } else if (page_idx == erp->er_extoff + erp->er_extcount &&
3492                            erp->er_extcount == XFS_LINEAR_EXTS) {
3493                         ASSERT(realloc);
3494                         page_idx = 0;
3495                         erp_idx++;
3496                         erp = erp_idx < nlists ? erp + 1 : NULL;
3497                         break;
3498                 } else {
3499                         page_idx -= erp->er_extoff;
3500                         break;
3501                 }
3502         }
3503         *idxp = page_idx;
3504         *erp_idxp = erp_idx;
3505         return(erp);
3506 }
3507
3508 /*
3509  * Allocate and initialize an indirection array once the space needed
3510  * for incore extents increases above XFS_IEXT_BUFSZ.
3511  */
3512 void
3513 xfs_iext_irec_init(
3514         xfs_ifork_t     *ifp)           /* inode fork pointer */
3515 {
3516         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3517         xfs_extnum_t    nextents;       /* number of extents in file */
3518
3519         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3520         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3521         ASSERT(nextents <= XFS_LINEAR_EXTS);
3522
3523         erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
3524
3525         if (nextents == 0) {
3526                 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3527         } else if (!ifp->if_real_bytes) {
3528                 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
3529         } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
3530                 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
3531         }
3532         erp->er_extbuf = ifp->if_u1.if_extents;
3533         erp->er_extcount = nextents;
3534         erp->er_extoff = 0;
3535
3536         ifp->if_flags |= XFS_IFEXTIREC;
3537         ifp->if_real_bytes = XFS_IEXT_BUFSZ;
3538         ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
3539         ifp->if_u1.if_ext_irec = erp;
3540
3541         return;
3542 }
3543
3544 /*
3545  * Allocate and initialize a new entry in the indirection array.
3546  */
3547 xfs_ext_irec_t *
3548 xfs_iext_irec_new(
3549         xfs_ifork_t     *ifp,           /* inode fork pointer */
3550         int             erp_idx)        /* index for new irec */
3551 {
3552         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3553         int             i;              /* loop counter */
3554         int             nlists;         /* number of irec's (ex lists) */
3555
3556         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3557         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3558
3559         /* Resize indirection array */
3560         xfs_iext_realloc_indirect(ifp, ++nlists *
3561                                   sizeof(xfs_ext_irec_t));
3562         /*
3563          * Move records down in the array so the
3564          * new page can use erp_idx.
3565          */
3566         erp = ifp->if_u1.if_ext_irec;
3567         for (i = nlists - 1; i > erp_idx; i--) {
3568                 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
3569         }
3570         ASSERT(i == erp_idx);
3571
3572         /* Initialize new extent record */
3573         erp = ifp->if_u1.if_ext_irec;
3574         erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3575         ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3576         memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
3577         erp[erp_idx].er_extcount = 0;
3578         erp[erp_idx].er_extoff = erp_idx > 0 ?
3579                 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
3580         return (&erp[erp_idx]);
3581 }
3582
3583 /*
3584  * Remove a record from the indirection array.
3585  */
3586 void
3587 xfs_iext_irec_remove(
3588         xfs_ifork_t     *ifp,           /* inode fork pointer */
3589         int             erp_idx)        /* irec index to remove */
3590 {
3591         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3592         int             i;              /* loop counter */
3593         int             nlists;         /* number of irec's (ex lists) */
3594
3595         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3596         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3597         erp = &ifp->if_u1.if_ext_irec[erp_idx];
3598         if (erp->er_extbuf) {
3599                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
3600                         -erp->er_extcount);
3601                 kmem_free(erp->er_extbuf);
3602         }
3603         /* Compact extent records */
3604         erp = ifp->if_u1.if_ext_irec;
3605         for (i = erp_idx; i < nlists - 1; i++) {
3606                 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
3607         }
3608         /*
3609          * Manually free the last extent record from the indirection
3610          * array.  A call to xfs_iext_realloc_indirect() with a size
3611          * of zero would result in a call to xfs_iext_destroy() which
3612          * would in turn call this function again, creating a nasty
3613          * infinite loop.
3614          */
3615         if (--nlists) {
3616                 xfs_iext_realloc_indirect(ifp,
3617                         nlists * sizeof(xfs_ext_irec_t));
3618         } else {
3619                 kmem_free(ifp->if_u1.if_ext_irec);
3620         }
3621         ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3622 }
3623
3624 /*
3625  * This is called to clean up large amounts of unused memory allocated
3626  * by the indirection array.  Before compacting anything though, verify
3627  * that the indirection array is still needed and switch back to the
3628  * linear extent list (or even the inline buffer) if possible.  The
3629  * compaction policy is as follows:
3630  *
3631  *    Full Compaction: Extents fit into a single page (or inline buffer)
3632  * Partial Compaction: Extents occupy less than 50% of allocated space
3633  *      No Compaction: Extents occupy at least 50% of allocated space
3634  */
3635 void
3636 xfs_iext_irec_compact(
3637         xfs_ifork_t     *ifp)           /* inode fork pointer */
3638 {
3639         xfs_extnum_t    nextents;       /* number of extents in file */
3640         int             nlists;         /* number of irec's (ex lists) */
3641
3642         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3643         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3644         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3645
3646         if (nextents == 0) {
3647                 xfs_iext_destroy(ifp);
3648         } else if (nextents <= XFS_INLINE_EXTS) {
3649                 xfs_iext_indirect_to_direct(ifp);
3650                 xfs_iext_direct_to_inline(ifp, nextents);
3651         } else if (nextents <= XFS_LINEAR_EXTS) {
3652                 xfs_iext_indirect_to_direct(ifp);
3653         } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
3654                 xfs_iext_irec_compact_pages(ifp);
3655         }
3656 }
3657
3658 /*
3659  * Combine extents from neighboring extent pages.
3660  */
3661 void
3662 xfs_iext_irec_compact_pages(
3663         xfs_ifork_t     *ifp)           /* inode fork pointer */
3664 {
3665         xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
3666         int             erp_idx = 0;    /* indirection array index */
3667         int             nlists;         /* number of irec's (ex lists) */
3668
3669         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3670         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3671         while (erp_idx < nlists - 1) {
3672                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3673                 erp_next = erp + 1;
3674                 if (erp_next->er_extcount <=
3675                     (XFS_LINEAR_EXTS - erp->er_extcount)) {
3676                         memcpy(&erp->er_extbuf[erp->er_extcount],
3677                                 erp_next->er_extbuf, erp_next->er_extcount *
3678                                 sizeof(xfs_bmbt_rec_t));
3679                         erp->er_extcount += erp_next->er_extcount;
3680                         /*
3681                          * Free page before removing extent record
3682                          * so er_extoffs don't get modified in
3683                          * xfs_iext_irec_remove.
3684                          */
3685                         kmem_free(erp_next->er_extbuf);
3686                         erp_next->er_extbuf = NULL;
3687                         xfs_iext_irec_remove(ifp, erp_idx + 1);
3688                         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3689                 } else {
3690                         erp_idx++;
3691                 }
3692         }
3693 }
3694
3695 /*
3696  * This is called to update the er_extoff field in the indirection
3697  * array when extents have been added or removed from one of the
3698  * extent lists. erp_idx contains the irec index to begin updating
3699  * at and ext_diff contains the number of extents that were added
3700  * or removed.
3701  */
3702 void
3703 xfs_iext_irec_update_extoffs(
3704         xfs_ifork_t     *ifp,           /* inode fork pointer */
3705         int             erp_idx,        /* irec index to update */
3706         int             ext_diff)       /* number of new extents */
3707 {
3708         int             i;              /* loop counter */
3709         int             nlists;         /* number of irec's (ex lists */
3710
3711         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3712         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3713         for (i = erp_idx; i < nlists; i++) {
3714                 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3715         }
3716 }