aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorroot <root@artemis.panaceas.org>2015-12-25 04:40:36 +0000
committerroot <root@artemis.panaceas.org>2015-12-25 04:40:36 +0000
commit849369d6c66d3054688672f97d31fceb8e8230fb (patch)
tree6135abc790ca67dedbe07c39806591e70eda81ce /fs/xfs
downloadlinux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.gz
linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.tar.bz2
linux-3.0.35-kobo-849369d6c66d3054688672f97d31fceb8e8230fb.zip
initial_commit
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Kconfig82
-rw-r--r--fs/xfs/Makefile111
-rw-r--r--fs/xfs/linux-2.6/kmem.c132
-rw-r--r--fs/xfs/linux-2.6/kmem.h124
-rw-r--r--fs/xfs/linux-2.6/mrlock.h90
-rw-r--r--fs/xfs/linux-2.6/time.h36
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c464
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1506
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h68
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c1899
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h351
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c222
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.h10
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c250
-rw-r--r--fs/xfs/linux-2.6/xfs_export.h72
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c1114
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c96
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c43
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c1556
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h85
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c672
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h237
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c778
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.h30
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h307
-rw-r--r--fs/xfs/linux-2.6/xfs_message.c108
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h39
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c139
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c122
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.h223
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c1720
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h87
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c1132
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h62
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c252
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.h102
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c56
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h1776
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h64
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c241
-rw-r--r--fs/xfs/quota/xfs_dquot.c1496
-rw-r--r--fs/xfs/quota/xfs_dquot.h143
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c533
-rw-r--r--fs/xfs/quota/xfs_dquot_item.h48
-rw-r--r--fs/xfs/quota/xfs_qm.c2462
-rw-r--r--fs/xfs/quota/xfs_qm.h172
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c176
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c105
-rw-r--r--fs/xfs/quota/xfs_qm_stats.h53
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c1259
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h53
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c895
-rw-r--r--fs/xfs/support/uuid.c63
-rw-r--r--fs/xfs/support/uuid.h29
-rw-r--r--fs/xfs/xfs.h29
-rw-r--r--fs/xfs/xfs_acl.h62
-rw-r--r--fs/xfs/xfs_ag.h289
-rw-r--r--fs/xfs/xfs_alloc.c3047
-rw-r--r--fs/xfs/xfs_alloc.h253
-rw-r--r--fs/xfs/xfs_alloc_btree.c458
-rw-r--r--fs/xfs/xfs_alloc_btree.h110
-rw-r--r--fs/xfs/xfs_arch.h136
-rw-r--r--fs/xfs/xfs_attr.c2230
-rw-r--r--fs/xfs/xfs_attr.h146
-rw-r--r--fs/xfs/xfs_attr_leaf.c2979
-rw-r--r--fs/xfs/xfs_attr_leaf.h265
-rw-r--r--fs/xfs/xfs_attr_sf.h70
-rw-r--r--fs/xfs/xfs_bit.c120
-rw-r--r--fs/xfs/xfs_bit.h84
-rw-r--r--fs/xfs/xfs_bmap.c6139
-rw-r--r--fs/xfs/xfs_bmap.h402
-rw-r--r--fs/xfs/xfs_bmap_btree.c919
-rw-r--r--fs/xfs/xfs_bmap_btree.h240
-rw-r--r--fs/xfs/xfs_btree.c3679
-rw-r--r--fs/xfs/xfs_btree.h455
-rw-r--r--fs/xfs/xfs_btree_trace.c249
-rw-r--r--fs/xfs/xfs_btree_trace.h99
-rw-r--r--fs/xfs/xfs_buf_item.c1064
-rw-r--r--fs/xfs/xfs_buf_item.h130
-rw-r--r--fs/xfs/xfs_da_btree.c2463
-rw-r--r--fs/xfs/xfs_da_btree.h281
-rw-r--r--fs/xfs/xfs_dfrag.c457
-rw-r--r--fs/xfs/xfs_dfrag.h53
-rw-r--r--fs/xfs/xfs_dinode.h216
-rw-r--r--fs/xfs/xfs_dir2.c764
-rw-r--r--fs/xfs/xfs_dir2.h106
-rw-r--r--fs/xfs/xfs_dir2_block.c1233
-rw-r--r--fs/xfs/xfs_dir2_block.h92
-rw-r--r--fs/xfs/xfs_dir2_data.c833
-rw-r--r--fs/xfs/xfs_dir2_data.h184
-rw-r--r--fs/xfs/xfs_dir2_leaf.c1881
-rw-r--r--fs/xfs/xfs_dir2_leaf.h253
-rw-r--r--fs/xfs/xfs_dir2_node.c2076
-rw-r--r--fs/xfs/xfs_dir2_node.h100
-rw-r--r--fs/xfs/xfs_dir2_sf.c1267
-rw-r--r--fs/xfs/xfs_dir2_sf.h171
-rw-r--r--fs/xfs/xfs_error.c185
-rw-r--r--fs/xfs/xfs_error.h161
-rw-r--r--fs/xfs/xfs_extfree_item.c520
-rw-r--r--fs/xfs/xfs_extfree_item.h161
-rw-r--r--fs/xfs/xfs_filestream.c824
-rw-r--r--fs/xfs/xfs_filestream.h74
-rw-r--r--fs/xfs/xfs_fs.h501
-rw-r--r--fs/xfs/xfs_fsops.c671
-rw-r--r--fs/xfs/xfs_fsops.h30
-rw-r--r--fs/xfs/xfs_ialloc.c1563
-rw-r--r--fs/xfs/xfs_ialloc.h164
-rw-r--r--fs/xfs/xfs_ialloc_btree.c346
-rw-r--r--fs/xfs/xfs_ialloc_btree.h112
-rw-r--r--fs/xfs/xfs_iget.c727
-rw-r--r--fs/xfs/xfs_inode.c4145
-rw-r--r--fs/xfs/xfs_inode.h600
-rw-r--r--fs/xfs/xfs_inode_item.c1060
-rw-r--r--fs/xfs/xfs_inode_item.h166
-rw-r--r--fs/xfs/xfs_inum.h80
-rw-r--r--fs/xfs/xfs_iomap.c748
-rw-r--r--fs/xfs/xfs_iomap.h32
-rw-r--r--fs/xfs/xfs_itable.c736
-rw-r--r--fs/xfs/xfs_itable.h106
-rw-r--r--fs/xfs/xfs_log.c3767
-rw-r--r--fs/xfs/xfs_log.h198
-rw-r--r--fs/xfs/xfs_log_cil.c804
-rw-r--r--fs/xfs/xfs_log_priv.h668
-rw-r--r--fs/xfs/xfs_log_recover.c3843
-rw-r--r--fs/xfs/xfs_log_recover.h66
-rw-r--r--fs/xfs/xfs_mount.c2585
-rw-r--r--fs/xfs/xfs_mount.h402
-rw-r--r--fs/xfs/xfs_mru_cache.c576
-rw-r--r--fs/xfs/xfs_mru_cache.h53
-rw-r--r--fs/xfs/xfs_quota.h390
-rw-r--r--fs/xfs/xfs_rename.c358
-rw-r--r--fs/xfs/xfs_rtalloc.c2325
-rw-r--r--fs/xfs/xfs_rtalloc.h166
-rw-r--r--fs/xfs/xfs_rw.c175
-rw-r--r--fs/xfs/xfs_rw.h49
-rw-r--r--fs/xfs/xfs_sb.h543
-rw-r--r--fs/xfs/xfs_trans.c2026
-rw-r--r--fs/xfs/xfs_trans.h506
-rw-r--r--fs/xfs/xfs_trans_ail.c890
-rw-r--r--fs/xfs/xfs_trans_buf.c879
-rw-r--r--fs/xfs/xfs_trans_extfree.c134
-rw-r--r--fs/xfs/xfs_trans_inode.c190
-rw-r--r--fs/xfs/xfs_trans_priv.h143
-rw-r--r--fs/xfs/xfs_trans_space.h86
-rw-r--r--fs/xfs/xfs_types.h156
-rw-r--r--fs/xfs/xfs_utils.c321
-rw-r--r--fs/xfs/xfs_utils.h27
-rw-r--r--fs/xfs/xfs_vnodeops.c2852
-rw-r--r--fs/xfs/xfs_vnodeops.h63
149 files changed, 98982 insertions, 0 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
new file mode 100644
index 00000000..6100ec0f
--- /dev/null
+++ b/fs/xfs/Kconfig
@@ -0,0 +1,82 @@
+config XFS_FS
+ tristate "XFS filesystem support"
+ depends on BLOCK
+ select EXPORTFS
+ help
+ XFS is a high performance journaling filesystem which originated
+ on the SGI IRIX platform. It is completely multi-threaded, can
+ support large files and large filesystems, extended attributes,
+ variable block sizes, is extent based, and makes extensive use of
+ Btrees (directories, extents, free space) to aid both performance
+ and scalability.
+
+ Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
+ for complete details. This implementation is on-disk compatible
+ with the IRIX version of XFS.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called xfs. Be aware, however, that if the file
+ system of your root partition is compiled as a module, you'll need
+ to use an initial ramdisk (initrd) to boot.
+
+config XFS_QUOTA
+ bool "XFS Quota support"
+ depends on XFS_FS
+ select QUOTACTL
+ help
+ If you say Y here, you will be able to set limits for disk usage on
+ a per user and/or a per group basis under XFS. XFS considers quota
+ information as filesystem metadata and uses journaling to provide a
+ higher level guarantee of consistency. The on-disk data format for
+ quota is also compatible with the IRIX version of XFS, allowing a
+ filesystem to be migrated between Linux and IRIX without any need
+ for conversion.
+
+ If unsure, say N. More comprehensive documentation can be found in
+ README.quota in the xfsprogs package. XFS quota can be used either
+ with or without the generic quota support enabled (CONFIG_QUOTA) -
+ they are completely independent subsystems.
+
+config XFS_POSIX_ACL
+ bool "XFS POSIX ACL support"
+ depends on XFS_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config XFS_RT
+ bool "XFS Realtime subvolume support"
+ depends on XFS_FS
+ help
+ If you say Y here you will be able to mount and use XFS filesystems
+ which contain a realtime subvolume. The realtime subvolume is a
+ separate area of disk space where only file data is stored. It was
+ originally designed to provide deterministic data rates suitable
+ for media streaming applications, but is also useful as a generic
+ mechanism for ensuring data and metadata/log I/Os are completely
+ separated. Regular file I/Os are isolated to a separate device
+ from all other requests, and this can be done quite transparently
+ to applications via the inherit-realtime directory inode flag.
+
+ See the xfs man page in section 5 for additional information.
+
+ If unsure, say N.
+
+config XFS_DEBUG
+ bool "XFS Debugging support (EXPERIMENTAL)"
+ depends on XFS_FS && EXPERIMENTAL
+ help
+ Say Y here to get an XFS build with many debugging features,
+ including ASSERT checks, function wrappers around macros,
+ and extra sanity-checking functions in various code paths.
+
+ Note that the resulting code will be HUGE and SLOW, and probably
+ not useful unless you are debugging a particular problem.
+
+ Say N unless you are an XFS developer, or you play one on TV.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
new file mode 100644
index 00000000..284a7c89
--- /dev/null
+++ b/fs/xfs/Makefile
@@ -0,0 +1,111 @@
+#
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+ccflags-y := -I$(src) -I$(src)/linux-2.6
+ccflags-$(CONFIG_XFS_DEBUG) += -g
+
+XFS_LINUX := linux-2.6
+
+obj-$(CONFIG_XFS_FS) += xfs.o
+
+xfs-y += linux-2.6/xfs_trace.o
+
+xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
+ xfs_dquot.o \
+ xfs_dquot_item.o \
+ xfs_trans_dquot.o \
+ xfs_qm_syscalls.o \
+ xfs_qm_bhv.o \
+ xfs_qm.o)
+xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o
+
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
+endif
+
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o
+xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o
+xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o
+xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o
+
+
+xfs-y += xfs_alloc.o \
+ xfs_alloc_btree.o \
+ xfs_attr.o \
+ xfs_attr_leaf.o \
+ xfs_bit.o \
+ xfs_bmap.o \
+ xfs_bmap_btree.o \
+ xfs_btree.o \
+ xfs_buf_item.o \
+ xfs_da_btree.o \
+ xfs_dir2.o \
+ xfs_dir2_block.o \
+ xfs_dir2_data.o \
+ xfs_dir2_leaf.o \
+ xfs_dir2_node.o \
+ xfs_dir2_sf.o \
+ xfs_error.o \
+ xfs_extfree_item.o \
+ xfs_filestream.o \
+ xfs_fsops.o \
+ xfs_ialloc.o \
+ xfs_ialloc_btree.o \
+ xfs_iget.o \
+ xfs_inode.o \
+ xfs_inode_item.o \
+ xfs_iomap.o \
+ xfs_itable.o \
+ xfs_dfrag.o \
+ xfs_log.o \
+ xfs_log_cil.o \
+ xfs_log_recover.o \
+ xfs_mount.o \
+ xfs_mru_cache.o \
+ xfs_rename.o \
+ xfs_trans.o \
+ xfs_trans_ail.o \
+ xfs_trans_buf.o \
+ xfs_trans_extfree.o \
+ xfs_trans_inode.o \
+ xfs_utils.o \
+ xfs_vnodeops.o \
+ xfs_rw.o
+
+xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o
+
+# Objects in linux/
+xfs-y += $(addprefix $(XFS_LINUX)/, \
+ kmem.o \
+ xfs_aops.o \
+ xfs_buf.o \
+ xfs_discard.o \
+ xfs_export.o \
+ xfs_file.o \
+ xfs_fs_subr.o \
+ xfs_globals.o \
+ xfs_ioctl.o \
+ xfs_iops.o \
+ xfs_message.o \
+ xfs_super.o \
+ xfs_sync.o \
+ xfs_xattr.o)
+
+# Objects in support/
+xfs-y += support/uuid.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
new file mode 100644
index 00000000..a907de56
--- /dev/null
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include "time.h"
+#include "kmem.h"
+#include "xfs_message.h"
+
+/*
+ * Greedy allocation. May fail and may return vmalloced memory.
+ *
+ * Must be freed using kmem_free_large.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+ void *ptr;
+ size_t kmsize = maxsize;
+
+ while (!(ptr = kmem_zalloc_large(kmsize))) {
+ if ((kmsize >>= 1) <= minsize)
+ kmsize = minsize;
+ }
+ if (ptr)
+ *size = kmsize;
+ return ptr;
+}
+
+void *
+kmem_alloc(size_t size, unsigned int __nocast flags)
+{
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ do {
+ ptr = kmalloc(size, lflags);
+ if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ return ptr;
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "possible memory allocation deadlock in %s (mode:0x%x)",
+ __func__, lflags);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ } while (1);
+}
+
+void *
+kmem_zalloc(size_t size, unsigned int __nocast flags)
+{
+ void *ptr;
+
+ ptr = kmem_alloc(size, flags);
+ if (ptr)
+ memset((char *)ptr, 0, (int)size);
+ return ptr;
+}
+
+void
+kmem_free(const void *ptr)
+{
+ if (!is_vmalloc_addr(ptr)) {
+ kfree(ptr);
+ } else {
+ vfree(ptr);
+ }
+}
+
+void *
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
+ unsigned int __nocast flags)
+{
+ void *new;
+
+ new = kmem_alloc(newsize, flags);
+ if (ptr) {
+ if (new)
+ memcpy(new, ptr,
+ ((oldsize < newsize) ? oldsize : newsize));
+ kmem_free(ptr);
+ }
+ return new;
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+{
+ int retries = 0;
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ do {
+ ptr = kmem_cache_alloc(zone, lflags);
+ if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ return ptr;
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "possible memory allocation deadlock in %s (mode:0x%x)",
+ __func__, lflags);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ } while (1);
+}
+
+void *
+kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+{
+ void *ptr;
+
+ ptr = kmem_zone_alloc(zone, flags);
+ if (ptr)
+ memset((char *)ptr, 0, kmem_cache_size(zone));
+ return ptr;
+}
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
new file mode 100644
index 00000000..f7c8f7a9
--- /dev/null
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+/*
+ * General memory allocation interfaces
+ */
+
+#define KM_SLEEP 0x0001u
+#define KM_NOSLEEP 0x0002u
+#define KM_NOFS 0x0004u
+#define KM_MAYFAIL 0x0008u
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions. We will also issue our own
+ * warnings, so we explicitly skip any generic ones (silly of us).
+ */
+static inline gfp_t
+kmem_flags_convert(unsigned int __nocast flags)
+{
+ gfp_t lflags;
+
+ BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
+
+ if (flags & KM_NOSLEEP) {
+ lflags = GFP_ATOMIC | __GFP_NOWARN;
+ } else {
+ lflags = GFP_KERNEL | __GFP_NOWARN;
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ lflags &= ~__GFP_FS;
+ }
+ return lflags;
+}
+
+extern void *kmem_alloc(size_t, unsigned int __nocast);
+extern void *kmem_zalloc(size_t, unsigned int __nocast);
+extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void kmem_free(const void *);
+
+static inline void *kmem_zalloc_large(size_t size)
+{
+ void *ptr;
+
+ ptr = vmalloc(size);
+ if (ptr)
+ memset(ptr, 0, size);
+ return ptr;
+}
+static inline void kmem_free_large(void *ptr)
+{
+ vfree(ptr);
+}
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
+/*
+ * Zone interfaces
+ */
+
+#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
+#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
+#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+
+#define kmem_zone kmem_cache
+#define kmem_zone_t struct kmem_cache
+
+static inline kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+ return kmem_cache_create(zone_name, size, 0, 0, NULL);
+}
+
+static inline kmem_zone_t *
+kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+ void (*construct)(void *))
+{
+ return kmem_cache_create(zone_name, size, 0, flags, construct);
+}
+
+static inline void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+ kmem_cache_free(zone, ptr);
+}
+
+static inline void
+kmem_zone_destroy(kmem_zone_t *zone)
+{
+ if (zone)
+ kmem_cache_destroy(zone);
+}
+
+extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+
+static inline int
+kmem_shake_allow(gfp_t gfp_mask)
+{
+ return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
+}
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
new file mode 100644
index 00000000..ff6a1987
--- /dev/null
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/rwsem.h>
+
+typedef struct {
+ struct rw_semaphore mr_lock;
+#ifdef DEBUG
+ int mr_writer;
+#endif
+} mrlock_t;
+
+#ifdef DEBUG
+#define mrinit(mrp, name) \
+ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
+#else
+#define mrinit(mrp, name) \
+ do { init_rwsem(&(mrp)->mr_lock); } while (0)
+#endif
+
+#define mrlock_init(mrp, t,n,s) mrinit(mrp, n)
+#define mrfree(mrp) do { } while (0)
+
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
+{
+ down_read_nested(&mrp->mr_lock, subclass);
+}
+
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
+{
+ down_write_nested(&mrp->mr_lock, subclass);
+#ifdef DEBUG
+ mrp->mr_writer = 1;
+#endif
+}
+
+static inline int mrtryaccess(mrlock_t *mrp)
+{
+ return down_read_trylock(&mrp->mr_lock);
+}
+
+static inline int mrtryupdate(mrlock_t *mrp)
+{
+ if (!down_write_trylock(&mrp->mr_lock))
+ return 0;
+#ifdef DEBUG
+ mrp->mr_writer = 1;
+#endif
+ return 1;
+}
+
+static inline void mrunlock_excl(mrlock_t *mrp)
+{
+#ifdef DEBUG
+ mrp->mr_writer = 0;
+#endif
+ up_write(&mrp->mr_lock);
+}
+
+static inline void mrunlock_shared(mrlock_t *mrp)
+{
+ up_read(&mrp->mr_lock);
+}
+
+static inline void mrdemote(mrlock_t *mrp)
+{
+#ifdef DEBUG
+ mrp->mr_writer = 0;
+#endif
+ downgrade_write(&mrp->mr_lock);
+}
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h
new file mode 100644
index 00000000..387e695a
--- /dev/null
+++ b/fs/xfs/linux-2.6/time.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SUPPORT_TIME_H__
+#define __XFS_SUPPORT_TIME_H__
+
+#include <linux/sched.h>
+#include <linux/time.h>
+
+typedef struct timespec timespec_t;
+
+static inline void delay(long ticks)
+{
+ schedule_timeout_uninterruptible(ticks);
+}
+
+static inline void nanotime(struct timespec *tvp)
+{
+ *tvp = CURRENT_TIME;
+}
+
+#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
new file mode 100644
index 00000000..f86e0348
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+
+/*
+ * Locking scheme:
+ * - all ACL updates are protected by inode->i_mutex, which is taken before
+ * calling into this file.
+ */
+
+STATIC struct posix_acl *
+xfs_acl_from_disk(struct xfs_acl *aclp)
+{
+ struct posix_acl_entry *acl_e;
+ struct posix_acl *acl;
+ struct xfs_acl_entry *ace;
+ unsigned int count, i;
+
+ count = be32_to_cpu(aclp->acl_cnt);
+ if (count > XFS_ACL_MAX_ENTRIES)
+ return ERR_PTR(-EFSCORRUPTED);
+
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < count; i++) {
+ acl_e = &acl->a_entries[i];
+ ace = &aclp->acl_entry[i];
+
+ /*
+ * The tag is 32 bits on disk and 16 bits in core.
+ *
+ * Because every access to it goes through the core
+ * format first this is not a problem.
+ */
+ acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+ acl_e->e_perm = be16_to_cpu(ace->ae_perm);
+
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ case ACL_GROUP:
+ acl_e->e_id = be32_to_cpu(ace->ae_id);
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ acl_e->e_id = ACL_UNDEFINED_ID;
+ break;
+ default:
+ goto fail;
+ }
+ }
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
+{
+ const struct posix_acl_entry *acl_e;
+ struct xfs_acl_entry *ace;
+ int i;
+
+ aclp->acl_cnt = cpu_to_be32(acl->a_count);
+ for (i = 0; i < acl->a_count; i++) {
+ ace = &aclp->acl_entry[i];
+ acl_e = &acl->a_entries[i];
+
+ ace->ae_tag = cpu_to_be32(acl_e->e_tag);
+ ace->ae_id = cpu_to_be32(acl_e->e_id);
+ ace->ae_perm = cpu_to_be16(acl_e->e_perm);
+ }
+}
+
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct posix_acl *acl;
+ struct xfs_acl *xfs_acl;
+ int len = sizeof(struct xfs_acl);
+ unsigned char *ea_name;
+ int error;
+
+ acl = get_cached_acl(inode, type);
+ if (acl != ACL_NOT_CACHED)
+ return acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ ea_name = SGI_ACL_FILE;
+ break;
+ case ACL_TYPE_DEFAULT:
+ ea_name = SGI_ACL_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+ /*
+ * If we have a cached ACLs value just return it, not need to
+ * go out to the disk.
+ */
+
+ xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ if (!xfs_acl)
+ return ERR_PTR(-ENOMEM);
+
+ error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+ &len, ATTR_ROOT);
+ if (error) {
+ /*
+ * If the attribute doesn't exist make sure we have a negative
+ * cache entry, for any other error assume it is transient and
+ * leave the cache entry as ACL_NOT_CACHED.
+ */
+ if (error == -ENOATTR) {
+ acl = NULL;
+ goto out_update_cache;
+ }
+ goto out;
+ }
+
+ acl = xfs_acl_from_disk(xfs_acl);
+ if (IS_ERR(acl))
+ goto out;
+
+ out_update_cache:
+ set_cached_acl(inode, type, acl);
+ out:
+ kfree(xfs_acl);
+ return acl;
+}
+
+STATIC int
+xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned char *ea_name;
+ int error;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ ea_name = SGI_ACL_FILE;
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ ea_name = SGI_ACL_DEFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ struct xfs_acl *xfs_acl;
+ int len;
+
+ xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+ if (!xfs_acl)
+ return -ENOMEM;
+
+ xfs_acl_to_disk(xfs_acl, acl);
+ len = sizeof(struct xfs_acl) -
+ (sizeof(struct xfs_acl_entry) *
+ (XFS_ACL_MAX_ENTRIES - acl->a_count));
+
+ error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+ len, ATTR_ROOT);
+
+ kfree(xfs_acl);
+ } else {
+ /*
+ * A NULL ACL argument means we want to remove the ACL.
+ */
+ error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+
+ /*
+ * If the attribute didn't exist to start with that's fine.
+ */
+ if (error == -ENOATTR)
+ error = 0;
+ }
+
+ if (!error)
+ set_cached_acl(inode, type, acl);
+ return error;
+}
+
+int
+xfs_check_acl(struct inode *inode, int mask, unsigned int flags)
+{
+ struct xfs_inode *ip;
+ struct posix_acl *acl;
+ int error = -EAGAIN;
+
+ ip = XFS_I(inode);
+ trace_xfs_check_acl(ip);
+
+ /*
+ * If there is no attribute fork no ACL exists on this inode and
+ * we can skip the whole exercise.
+ */
+ if (!XFS_IFORK_Q(ip))
+ return -EAGAIN;
+
+ if (flags & IPERM_FLAG_RCU) {
+ if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+ return -ECHILD;
+ return -EAGAIN;
+ }
+
+ acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ }
+
+ return error;
+}
+
+static int
+xfs_set_mode(struct inode *inode, mode_t mode)
+{
+ int error = 0;
+
+ if (mode != inode->i_mode) {
+ struct iattr iattr;
+
+ iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
+ iattr.ia_mode = mode;
+ iattr.ia_ctime = current_fs_time(inode->i_sb);
+
+ error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+ }
+
+ return error;
+}
+
+static int
+xfs_acl_exists(struct inode *inode, unsigned char *name)
+{
+ int len = sizeof(struct xfs_acl);
+
+ return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
+ ATTR_ROOT|ATTR_KERNOVAL) == 0);
+}
+
+int
+posix_acl_access_exists(struct inode *inode)
+{
+ return xfs_acl_exists(inode, SGI_ACL_FILE);
+}
+
+int
+posix_acl_default_exists(struct inode *inode)
+{
+ if (!S_ISDIR(inode->i_mode))
+ return 0;
+ return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
+}
+
+/*
+ * No need for i_mutex because the inode is not yet exposed to the VFS.
+ */
+int
+xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl)
+{
+ struct posix_acl *clone;
+ mode_t mode;
+ int error = 0, inherit = 0;
+
+ if (S_ISDIR(inode->i_mode)) {
+ error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl);
+ if (error)
+ return error;
+ }
+
+ clone = posix_acl_clone(default_acl, GFP_KERNEL);
+ if (!clone)
+ return -ENOMEM;
+
+ mode = inode->i_mode;
+ error = posix_acl_create_masq(clone, &mode);
+ if (error < 0)
+ goto out_release_clone;
+
+ /*
+ * If posix_acl_create_masq returns a positive value we need to
+ * inherit a permission that can't be represented using the Unix
+ * mode bits and we actually need to set an ACL.
+ */
+ if (error > 0)
+ inherit = 1;
+
+ error = xfs_set_mode(inode, mode);
+ if (error)
+ goto out_release_clone;
+
+ if (inherit)
+ error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+
+ out_release_clone:
+ posix_acl_release(clone);
+ return error;
+}
+
+int
+xfs_acl_chmod(struct inode *inode)
+{
+ struct posix_acl *acl, *clone;
+ int error;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ posix_acl_release(acl);
+ if (!clone)
+ return -ENOMEM;
+
+ error = posix_acl_chmod_masq(clone, inode->i_mode);
+ if (!error)
+ error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+
+ posix_acl_release(clone);
+ return error;
+}
+
+static int
+xfs_xattr_acl_get(struct dentry *dentry, const char *name,
+ void *value, size_t size, int type)
+{
+ struct posix_acl *acl;
+ int error;
+
+ acl = xfs_get_acl(dentry->d_inode, type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+
+ error = posix_acl_to_xattr(acl, value, size);
+ posix_acl_release(acl);
+
+ return error;
+}
+
+static int
+xfs_xattr_acl_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ struct inode *inode = dentry->d_inode;
+ struct posix_acl *acl = NULL;
+ int error = 0;
+
+ if (flags & XATTR_CREATE)
+ return -EINVAL;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return value ? -EACCES : 0;
+ if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EPERM;
+
+ if (!value)
+ goto set_acl;
+
+ acl = posix_acl_from_xattr(value, size);
+ if (!acl) {
+ /*
+ * acl_set_file(3) may request that we set default ACLs with
+ * zero length -- defend (gracefully) against that here.
+ */
+ goto out;
+ }
+ if (IS_ERR(acl)) {
+ error = PTR_ERR(acl);
+ goto out;
+ }
+
+ error = posix_acl_valid(acl);
+ if (error)
+ goto out_release;
+
+ error = -EINVAL;
+ if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+ goto out_release;
+
+ if (type == ACL_TYPE_ACCESS) {
+ mode_t mode = inode->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+
+ if (error <= 0) {
+ posix_acl_release(acl);
+ acl = NULL;
+
+ if (error < 0)
+ return error;
+ }
+
+ error = xfs_set_mode(inode, mode);
+ if (error)
+ goto out_release;
+ }
+
+ set_acl:
+ error = xfs_set_acl(inode, type, acl);
+ out_release:
+ posix_acl_release(acl);
+ out:
+ return error;
+}
+
+const struct xattr_handler xfs_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .get = xfs_xattr_acl_get,
+ .set = xfs_xattr_acl_set,
+};
+
+const struct xattr_handler xfs_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .get = xfs_xattr_acl_get,
+ .set = xfs_xattr_acl_set,
+};
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
new file mode 100644
index 00000000..79ce38be
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -0,0 +1,1506 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_trans.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_rw.h"
+#include "xfs_iomap.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include <linux/gfp.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+
+/*
+ * Prime number of hash buckets since address is used as the key.
+ */
+#define NVSYNC 37
+#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
+static wait_queue_head_t xfs_ioend_wq[NVSYNC];
+
+void __init
+xfs_ioend_init(void)
+{
+ int i;
+
+ for (i = 0; i < NVSYNC; i++)
+ init_waitqueue_head(&xfs_ioend_wq[i]);
+}
+
+void
+xfs_ioend_wait(
+ xfs_inode_t *ip)
+{
+ wait_queue_head_t *wq = to_ioend_wq(ip);
+
+ wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
+}
+
+STATIC void
+xfs_ioend_wake(
+ xfs_inode_t *ip)
+{
+ if (atomic_dec_and_test(&ip->i_iocount))
+ wake_up(to_ioend_wq(ip));
+}
+
+void
+xfs_count_page_state(
+ struct page *page,
+ int *delalloc,
+ int *unwritten)
+{
+ struct buffer_head *bh, *head;
+
+ *delalloc = *unwritten = 0;
+
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_unwritten(bh))
+ (*unwritten) = 1;
+ else if (buffer_delay(bh))
+ (*delalloc) = 1;
+ } while ((bh = bh->b_this_page) != head);
+}
+
+STATIC struct block_device *
+xfs_find_bdev_for_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ return mp->m_rtdev_targp->bt_bdev;
+ else
+ return mp->m_ddev_targp->bt_bdev;
+}
+
+/*
+ * We're now finished for good with this ioend structure.
+ * Update the page state via the associated buffer_heads,
+ * release holds on the inode and bio, and finally free
+ * up memory. Do not use the ioend after this.
+ */
+STATIC void
+xfs_destroy_ioend(
+ xfs_ioend_t *ioend)
+{
+ struct buffer_head *bh, *next;
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+
+ for (bh = ioend->io_buffer_head; bh; bh = next) {
+ next = bh->b_private;
+ bh->b_end_io(bh, !ioend->io_error);
+ }
+
+ /*
+ * Volume managers supporting multiple paths can send back ENODEV
+ * when the final path disappears. In this case continuing to fill
+ * the page cache with dirty data which cannot be written out is
+ * evil, so prevent that.
+ */
+ if (unlikely(ioend->io_error == -ENODEV)) {
+ xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
+ __FILE__, __LINE__);
+ }
+
+ xfs_ioend_wake(ip);
+ mempool_free(ioend, xfs_ioend_pool);
+}
+
+/*
+ * If the end of the current ioend is beyond the current EOF,
+ * return the new EOF value, otherwise zero.
+ */
+STATIC xfs_fsize_t
+xfs_ioend_new_eof(
+ xfs_ioend_t *ioend)
+{
+ xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ xfs_fsize_t isize;
+ xfs_fsize_t bsize;
+
+ bsize = ioend->io_offset + ioend->io_size;
+ isize = MAX(ip->i_size, ip->i_new_size);
+ isize = MIN(isize, bsize);
+ return isize > ip->i_d.di_size ? isize : 0;
+}
+
+/*
+ * Update on-disk file size now that data has been written to disk. The
+ * current in-memory file size is i_size. If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated. If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
+ */
+STATIC int
+xfs_setfilesize(
+ xfs_ioend_t *ioend)
+{
+ xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ xfs_fsize_t isize;
+
+ if (unlikely(ioend->io_error))
+ return 0;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return EAGAIN;
+
+ isize = xfs_ioend_new_eof(ioend);
+ if (isize) {
+ ip->i_d.di_size = isize;
+ xfs_mark_inode_dirty(ip);
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
+ * Schedule IO completion handling on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend(
+ struct xfs_ioend *ioend)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining)) {
+ if (ioend->io_type == IO_UNWRITTEN)
+ queue_work(xfsconvertd_workqueue, &ioend->io_work);
+ else
+ queue_work(xfsdatad_workqueue, &ioend->io_work);
+ }
+}
+
+/*
+ * IO write completion.
+ */
+STATIC void
+xfs_end_io(
+ struct work_struct *work)
+{
+ xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ int error = 0;
+
+ /*
+ * For unwritten extents we need to issue transactions to convert a
+ * range to normal written extens after the data I/O has finished.
+ */
+ if (ioend->io_type == IO_UNWRITTEN &&
+ likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
+
+ error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+ ioend->io_size);
+ if (error)
+ ioend->io_error = error;
+ }
+
+ /*
+ * We might have to update the on-disk file size after extending
+ * writes.
+ */
+ error = xfs_setfilesize(ioend);
+ ASSERT(!error || error == EAGAIN);
+
+ /*
+ * If we didn't complete processing of the ioend, requeue it to the
+ * tail of the workqueue for another attempt later. Otherwise destroy
+ * it.
+ */
+ if (error == EAGAIN) {
+ atomic_inc(&ioend->io_remaining);
+ xfs_finish_ioend(ioend);
+ /* ensure we don't spin on blocked ioends */
+ delay(1);
+ } else {
+ if (ioend->io_iocb)
+ aio_complete(ioend->io_iocb, ioend->io_result, 0);
+ xfs_destroy_ioend(ioend);
+ }
+}
+
+/*
+ * Call IO completion handling in caller context on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend_sync(
+ struct xfs_ioend *ioend)
+{
+ if (atomic_dec_and_test(&ioend->io_remaining))
+ xfs_end_io(&ioend->io_work);
+}
+
+/*
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
+ */
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+ struct inode *inode,
+ unsigned int type)
+{
+ xfs_ioend_t *ioend;
+
+ ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
+
+ /*
+ * Set the count to 1 initially, which will prevent an I/O
+ * completion callback from happening before we have started
+ * all the I/O from calling the completion routine too early.
+ */
+ atomic_set(&ioend->io_remaining, 1);
+ ioend->io_error = 0;
+ ioend->io_list = NULL;
+ ioend->io_type = type;
+ ioend->io_inode = inode;
+ ioend->io_buffer_head = NULL;
+ ioend->io_buffer_tail = NULL;
+ atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
+ ioend->io_offset = 0;
+ ioend->io_size = 0;
+ ioend->io_iocb = NULL;
+ ioend->io_result = 0;
+
+ INIT_WORK(&ioend->io_work, xfs_end_io);
+ return ioend;
+}
+
+STATIC int
+xfs_map_blocks(
+ struct inode *inode,
+ loff_t offset,
+ struct xfs_bmbt_irec *imap,
+ int type,
+ int nonblocking)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t count = 1 << inode->i_blkbits;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int bmapi_flags = XFS_BMAPI_ENTIRE;
+ int nimaps = 1;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ if (type == IO_UNWRITTEN)
+ bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ if (nonblocking)
+ return -XFS_ERROR(EAGAIN);
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ }
+
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ (ip->i_df.if_flags & XFS_IFEXTENTS));
+ ASSERT(offset <= mp->m_maxioffset);
+
+ if (offset + count > mp->m_maxioffset)
+ count = mp->m_maxioffset - offset;
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+ bmapi_flags, NULL, 0, imap, &nimaps, NULL);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (error)
+ return -XFS_ERROR(error);
+
+ if (type == IO_DELALLOC &&
+ (!nimaps || isnullstartblock(imap->br_startblock))) {
+ error = xfs_iomap_write_allocate(ip, offset, count, imap);
+ if (!error)
+ trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+ return -XFS_ERROR(error);
+ }
+
+#ifdef DEBUG
+ if (type == IO_UNWRITTEN) {
+ ASSERT(nimaps);
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+ }
+#endif
+ if (nimaps)
+ trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+ return 0;
+}
+
+STATIC int
+xfs_imap_valid(
+ struct inode *inode,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ offset >>= inode->i_blkbits;
+
+ return offset >= imap->br_startoff &&
+ offset < imap->br_startoff + imap->br_blockcount;
+}
+
+/*
+ * BIO completion handler for buffered IO.
+ */
+STATIC void
+xfs_end_bio(
+ struct bio *bio,
+ int error)
+{
+ xfs_ioend_t *ioend = bio->bi_private;
+
+ ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+ ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+
+ /* Toss bio and pass work off to an xfsdatad thread */
+ bio->bi_private = NULL;
+ bio->bi_end_io = NULL;
+ bio_put(bio);
+
+ xfs_finish_ioend(ioend);
+}
+
+STATIC void
+xfs_submit_ioend_bio(
+ struct writeback_control *wbc,
+ xfs_ioend_t *ioend,
+ struct bio *bio)
+{
+ atomic_inc(&ioend->io_remaining);
+ bio->bi_private = ioend;
+ bio->bi_end_io = xfs_end_bio;
+
+ /*
+ * If the I/O is beyond EOF we mark the inode dirty immediately
+ * but don't update the inode size until I/O completion.
+ */
+ if (xfs_ioend_new_eof(ioend))
+ xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
+
+ submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
+}
+
+STATIC struct bio *
+xfs_alloc_ioend_bio(
+ struct buffer_head *bh)
+{
+ int nvecs = bio_get_nr_vecs(bh->b_bdev);
+ struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
+
+ ASSERT(bio->bi_private == NULL);
+ bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+ return bio;
+}
+
+STATIC void
+xfs_start_buffer_writeback(
+ struct buffer_head *bh)
+{
+ ASSERT(buffer_mapped(bh));
+ ASSERT(buffer_locked(bh));
+ ASSERT(!buffer_delay(bh));
+ ASSERT(!buffer_unwritten(bh));
+
+ mark_buffer_async_write(bh);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+}
+
+STATIC void
+xfs_start_page_writeback(
+ struct page *page,
+ int clear_dirty,
+ int buffers)
+{
+ ASSERT(PageLocked(page));
+ ASSERT(!PageWriteback(page));
+ if (clear_dirty)
+ clear_page_dirty_for_io(page);
+ set_page_writeback(page);
+ unlock_page(page);
+ /* If no buffers on the page are to be written, finish it here */
+ if (!buffers)
+ end_page_writeback(page);
+}
+
+static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+{
+ return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+}
+
+/*
+ * Submit all of the bios for all of the ioends we have saved up, covering the
+ * initial writepage page and also any probed pages.
+ *
+ * Because we may have multiple ioends spanning a page, we need to start
+ * writeback on all the buffers before we submit them for I/O. If we mark the
+ * buffers as we got, then we can end up with a page that only has buffers
+ * marked async write and I/O complete on can occur before we mark the other
+ * buffers async write.
+ *
+ * The end result of this is that we trip a bug in end_page_writeback() because
+ * we call it twice for the one page as the code in end_buffer_async_write()
+ * assumes that all buffers on the page are started at the same time.
+ *
+ * The fix is two passes across the ioend list - one to start writeback on the
+ * buffer_heads, and then submit them for I/O on the second pass.
+ */
+STATIC void
+xfs_submit_ioend(
+ struct writeback_control *wbc,
+ xfs_ioend_t *ioend)
+{
+ xfs_ioend_t *head = ioend;
+ xfs_ioend_t *next;
+ struct buffer_head *bh;
+ struct bio *bio;
+ sector_t lastblock = 0;
+
+ /* Pass 1 - start writeback */
+ do {
+ next = ioend->io_list;
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
+ xfs_start_buffer_writeback(bh);
+ } while ((ioend = next) != NULL);
+
+ /* Pass 2 - submit I/O */
+ ioend = head;
+ do {
+ next = ioend->io_list;
+ bio = NULL;
+
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+
+ if (!bio) {
+ retry:
+ bio = xfs_alloc_ioend_bio(bh);
+ } else if (bh->b_blocknr != lastblock + 1) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
+ }
+
+ if (bio_add_buffer(bio, bh) != bh->b_size) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
+ }
+
+ lastblock = bh->b_blocknr;
+ }
+ if (bio)
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ xfs_finish_ioend(ioend);
+ } while ((ioend = next) != NULL);
+}
+
+/*
+ * Cancel submission of all buffer_heads so far in this endio.
+ * Toss the endio too. Only ever called for the initial page
+ * in a writepage request, so only ever one page.
+ */
+STATIC void
+xfs_cancel_ioend(
+ xfs_ioend_t *ioend)
+{
+ xfs_ioend_t *next;
+ struct buffer_head *bh, *next_bh;
+
+ do {
+ next = ioend->io_list;
+ bh = ioend->io_buffer_head;
+ do {
+ next_bh = bh->b_private;
+ clear_buffer_async_write(bh);
+ unlock_buffer(bh);
+ } while ((bh = next_bh) != NULL);
+
+ xfs_ioend_wake(XFS_I(ioend->io_inode));
+ mempool_free(ioend, xfs_ioend_pool);
+ } while ((ioend = next) != NULL);
+}
+
+/*
+ * Test to see if we've been building up a completion structure for
+ * earlier buffers -- if so, we try to append to this ioend if we
+ * can, otherwise we finish off any current ioend and start another.
+ * Return true if we've finished the given ioend.
+ */
+STATIC void
+xfs_add_to_ioend(
+ struct inode *inode,
+ struct buffer_head *bh,
+ xfs_off_t offset,
+ unsigned int type,
+ xfs_ioend_t **result,
+ int need_ioend)
+{
+ xfs_ioend_t *ioend = *result;
+
+ if (!ioend || need_ioend || type != ioend->io_type) {
+ xfs_ioend_t *previous = *result;
+
+ ioend = xfs_alloc_ioend(inode, type);
+ ioend->io_offset = offset;
+ ioend->io_buffer_head = bh;
+ ioend->io_buffer_tail = bh;
+ if (previous)
+ previous->io_list = ioend;
+ *result = ioend;
+ } else {
+ ioend->io_buffer_tail->b_private = bh;
+ ioend->io_buffer_tail = bh;
+ }
+
+ bh->b_private = NULL;
+ ioend->io_size += bh->b_size;
+}
+
+STATIC void
+xfs_map_buffer(
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ sector_t bn;
+ struct xfs_mount *m = XFS_I(inode)->i_mount;
+ xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
+ xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
+
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+ bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
+ ((offset - iomap_offset) >> inode->i_blkbits);
+
+ ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
+
+ bh->b_blocknr = bn;
+ set_buffer_mapped(bh);
+}
+
+STATIC void
+xfs_map_at_offset(
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+ ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+ xfs_map_buffer(inode, bh, imap, offset);
+ set_buffer_mapped(bh);
+ clear_buffer_delay(bh);
+ clear_buffer_unwritten(bh);
+}
+
+/*
+ * Test if a given page is suitable for writing as part of an unwritten
+ * or delayed allocate extent.
+ */
+STATIC int
+xfs_is_delayed_page(
+ struct page *page,
+ unsigned int type)
+{
+ if (PageWriteback(page))
+ return 0;
+
+ if (page->mapping && page_has_buffers(page)) {
+ struct buffer_head *bh, *head;
+ int acceptable = 0;
+
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_unwritten(bh))
+ acceptable = (type == IO_UNWRITTEN);
+ else if (buffer_delay(bh))
+ acceptable = (type == IO_DELALLOC);
+ else if (buffer_dirty(bh) && buffer_mapped(bh))
+ acceptable = (type == IO_OVERWRITE);
+ else
+ break;
+ } while ((bh = bh->b_this_page) != head);
+
+ if (acceptable)
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc/unwritten pages only, for the original page it is possible
+ * that the page has no mapping at all.
+ */
+STATIC int
+xfs_convert_page(
+ struct inode *inode,
+ struct page *page,
+ loff_t tindex,
+ struct xfs_bmbt_irec *imap,
+ xfs_ioend_t **ioendp,
+ struct writeback_control *wbc)
+{
+ struct buffer_head *bh, *head;
+ xfs_off_t end_offset;
+ unsigned long p_offset;
+ unsigned int type;
+ int len, page_dirty;
+ int count = 0, done = 0, uptodate = 1;
+ xfs_off_t offset = page_offset(page);
+
+ if (page->index != tindex)
+ goto fail;
+ if (!trylock_page(page))
+ goto fail;
+ if (PageWriteback(page))
+ goto fail_unlock_page;
+ if (page->mapping != inode->i_mapping)
+ goto fail_unlock_page;
+ if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
+ goto fail_unlock_page;
+
+ /*
+ * page_dirty is initially a count of buffers on the page before
+ * EOF and is decremented as we move each into a cleanable state.
+ *
+ * Derivation:
+ *
+ * End offset is the highest offset that this page should represent.
+ * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
+ * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
+ * hence give us the correct page_dirty count. On any other page,
+ * it will be zero and in that case we need page_dirty to be the
+ * count of buffers on the page.
+ */
+ end_offset = min_t(unsigned long long,
+ (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+ i_size_read(inode));
+
+ len = 1 << inode->i_blkbits;
+ p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
+ PAGE_CACHE_SIZE);
+ p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
+ page_dirty = p_offset / len;
+
+ bh = head = page_buffers(page);
+ do {
+ if (offset >= end_offset)
+ break;
+ if (!buffer_uptodate(bh))
+ uptodate = 0;
+ if (!(PageUptodate(page) || buffer_uptodate(bh))) {
+ done = 1;
+ continue;
+ }
+
+ if (buffer_unwritten(bh) || buffer_delay(bh) ||
+ buffer_mapped(bh)) {
+ if (buffer_unwritten(bh))
+ type = IO_UNWRITTEN;
+ else if (buffer_delay(bh))
+ type = IO_DELALLOC;
+ else
+ type = IO_OVERWRITE;
+
+ if (!xfs_imap_valid(inode, imap, offset)) {
+ done = 1;
+ continue;
+ }
+
+ lock_buffer(bh);
+ if (type != IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, imap, offset);
+ xfs_add_to_ioend(inode, bh, offset, type,
+ ioendp, done);
+
+ page_dirty--;
+ count++;
+ } else {
+ done = 1;
+ }
+ } while (offset += len, (bh = bh->b_this_page) != head);
+
+ if (uptodate && bh == head)
+ SetPageUptodate(page);
+
+ if (count) {
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE)
+ done = 1;
+ }
+ xfs_start_page_writeback(page, !page_dirty, count);
+
+ return done;
+ fail_unlock_page:
+ unlock_page(page);
+ fail:
+ return 1;
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ */
+STATIC void
+xfs_cluster_write(
+ struct inode *inode,
+ pgoff_t tindex,
+ struct xfs_bmbt_irec *imap,
+ xfs_ioend_t **ioendp,
+ struct writeback_control *wbc,
+ pgoff_t tlast)
+{
+ struct pagevec pvec;
+ int done = 0, i;
+
+ pagevec_init(&pvec, 0);
+ while (!done && tindex <= tlast) {
+ unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
+
+ if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
+ break;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ done = xfs_convert_page(inode, pvec.pages[i], tindex++,
+ imap, ioendp, wbc);
+ if (done)
+ break;
+ }
+
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+STATIC void
+xfs_vm_invalidatepage(
+ struct page *page,
+ unsigned long offset)
+{
+ trace_xfs_invalidatepage(page->mapping->host, page, offset);
+ block_invalidatepage(page, offset);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+ struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct buffer_head *bh, *head;
+ loff_t offset = page_offset(page);
+
+ if (!xfs_is_delayed_page(page, IO_DELALLOC))
+ goto out_invalidate;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ goto out_invalidate;
+
+ xfs_alert(ip->i_mount,
+ "page discard on page %p, inode 0x%llx, offset %llu.",
+ page, ip->i_ino, offset);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ bh = head = page_buffers(page);
+ do {
+ int error;
+ xfs_fileoff_t start_fsb;
+
+ if (!buffer_delay(bh))
+ goto next_buffer;
+
+ start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "page discard unable to remove delalloc mapping.");
+ }
+ break;
+ }
+next_buffer:
+ offset += 1 << inode->i_blkbits;
+
+ } while ((bh = bh->b_this_page) != head);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_invalidate:
+ xfs_vm_invalidatepage(page, 0);
+ return;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
+ *
+ * If we detect that a transaction would be required to flush the page, we
+ * have to check the process flags first, if we are already in a transaction
+ * or disk I/O during allocations is off, we need to fail the writepage and
+ * redirty the page.
+ */
+STATIC int
+xfs_vm_writepage(
+ struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ int delalloc, unwritten;
+ struct buffer_head *bh, *head;
+ struct xfs_bmbt_irec imap;
+ xfs_ioend_t *ioend = NULL, *iohead = NULL;
+ loff_t offset;
+ unsigned int type;
+ __uint64_t end_offset;
+ pgoff_t end_index, last_index;
+ ssize_t len;
+ int err, imap_valid = 0, uptodate = 1;
+ int count = 0;
+ int nonblocking = 0;
+
+ trace_xfs_writepage(inode, page, 0);
+
+ ASSERT(page_has_buffers(page));
+
+ /*
+ * Refuse to write the page out if we are called from reclaim context.
+ *
+ * This avoids stack overflows when called from deeply used stacks in
+ * random callers for direct reclaim or memcg reclaim. We explicitly
+ * allow reclaim from kswapd as the stack usage there is relatively low.
+ *
+ * This should really be done by the core VM, but until that happens
+ * filesystems like XFS, btrfs and ext4 have to take care of this
+ * by themselves.
+ */
+ if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+ goto redirty;
+
+ /*
+ * We need a transaction if there are delalloc or unwritten buffers
+ * on the page.
+ *
+ * If we need a transaction and the process flags say we are already
+ * in a transaction, or no IO is allowed then mark the page dirty
+ * again and leave the page as is.
+ */
+ xfs_count_page_state(page, &delalloc, &unwritten);
+ if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
+ goto redirty;
+
+ /* Is this page beyond the end of the file? */
+ offset = i_size_read(inode);
+ end_index = offset >> PAGE_CACHE_SHIFT;
+ last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
+ if (page->index >= end_index) {
+ if ((page->index >= end_index + 1) ||
+ !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ end_offset = min_t(unsigned long long,
+ (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+ offset);
+ len = 1 << inode->i_blkbits;
+
+ bh = head = page_buffers(page);
+ offset = page_offset(page);
+ type = IO_OVERWRITE;
+
+ if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+ nonblocking = 1;
+
+ do {
+ int new_ioend = 0;
+
+ if (offset >= end_offset)
+ break;
+ if (!buffer_uptodate(bh))
+ uptodate = 0;
+
+ /*
+ * set_page_dirty dirties all buffers in a page, independent
+ * of their state. The dirty state however is entirely
+ * meaningless for holes (!mapped && uptodate), so skip
+ * buffers covering holes here.
+ */
+ if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+ imap_valid = 0;
+ continue;
+ }
+
+ if (buffer_unwritten(bh)) {
+ if (type != IO_UNWRITTEN) {
+ type = IO_UNWRITTEN;
+ imap_valid = 0;
+ }
+ } else if (buffer_delay(bh)) {
+ if (type != IO_DELALLOC) {
+ type = IO_DELALLOC;
+ imap_valid = 0;
+ }
+ } else if (buffer_uptodate(bh)) {
+ if (type != IO_OVERWRITE) {
+ type = IO_OVERWRITE;
+ imap_valid = 0;
+ }
+ } else {
+ if (PageUptodate(page)) {
+ ASSERT(buffer_mapped(bh));
+ imap_valid = 0;
+ }
+ continue;
+ }
+
+ if (imap_valid)
+ imap_valid = xfs_imap_valid(inode, &imap, offset);
+ if (!imap_valid) {
+ /*
+ * If we didn't have a valid mapping then we need to
+ * put the new mapping into a separate ioend structure.
+ * This ensures non-contiguous extents always have
+ * separate ioends, which is particularly important
+ * for unwritten extent conversion at I/O completion
+ * time.
+ */
+ new_ioend = 1;
+ err = xfs_map_blocks(inode, offset, &imap, type,
+ nonblocking);
+ if (err)
+ goto error;
+ imap_valid = xfs_imap_valid(inode, &imap, offset);
+ }
+ if (imap_valid) {
+ lock_buffer(bh);
+ if (type != IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, &imap, offset);
+ xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+ new_ioend);
+ count++;
+ }
+
+ if (!iohead)
+ iohead = ioend;
+
+ } while (offset += len, ((bh = bh->b_this_page) != head));
+
+ if (uptodate && bh == head)
+ SetPageUptodate(page);
+
+ xfs_start_page_writeback(page, 1, count);
+
+ if (ioend && imap_valid) {
+ xfs_off_t end_index;
+
+ end_index = imap.br_startoff + imap.br_blockcount;
+
+ /* to bytes */
+ end_index <<= inode->i_blkbits;
+
+ /* to pages */
+ end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+ /* check against file size */
+ if (end_index > last_index)
+ end_index = last_index;
+
+ xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+ wbc, end_index);
+ }
+
+ if (iohead)
+ xfs_submit_ioend(wbc, iohead);
+
+ return 0;
+
+error:
+ if (iohead)
+ xfs_cancel_ioend(iohead);
+
+ if (err == -EAGAIN)
+ goto redirty;
+
+ xfs_aops_discard_page(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ return err;
+
+redirty:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+}
+
+STATIC int
+xfs_vm_writepages(
+ struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ return generic_writepages(mapping, wbc);
+}
+
+/*
+ * Called to move a page into cleanable state - and from there
+ * to be released. The page should already be clean. We always
+ * have buffer heads in this call.
+ *
+ * Returns 1 if the page is ok to release, 0 otherwise.
+ */
+STATIC int
+xfs_vm_releasepage(
+ struct page *page,
+ gfp_t gfp_mask)
+{
+ int delalloc, unwritten;
+
+ trace_xfs_releasepage(page->mapping->host, page, 0);
+
+ xfs_count_page_state(page, &delalloc, &unwritten);
+
+ if (WARN_ON(delalloc))
+ return 0;
+ if (WARN_ON(unwritten))
+ return 0;
+
+ return try_to_free_buffers(page);
+}
+
+STATIC int
+__xfs_get_blocks(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create,
+ int direct)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb, end_fsb;
+ int error = 0;
+ int lockmode = 0;
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1;
+ xfs_off_t offset;
+ ssize_t size;
+ int new = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ offset = (xfs_off_t)iblock << inode->i_blkbits;
+ ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+ size = bh_result->b_size;
+
+ if (!create && direct && offset >= i_size_read(inode))
+ return 0;
+
+ if (create) {
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
+ } else {
+ lockmode = xfs_ilock_map_shared(ip);
+ }
+
+ ASSERT(offset <= mp->m_maxioffset);
+ if (offset + size > mp->m_maxioffset)
+ size = mp->m_maxioffset - offset;
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+ error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+ XFS_BMAPI_ENTIRE, NULL, 0, &imap, &nimaps, NULL);
+ if (error)
+ goto out_unlock;
+
+ if (create &&
+ (!nimaps ||
+ (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_startblock == DELAYSTARTBLOCK))) {
+ if (direct) {
+ error = xfs_iomap_write_direct(ip, offset, size,
+ &imap, nimaps);
+ } else {
+ error = xfs_iomap_write_delay(ip, offset, size, &imap);
+ }
+ if (error)
+ goto out_unlock;
+
+ trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+ } else if (nimaps) {
+ trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+ } else {
+ trace_xfs_get_blocks_notfound(ip, offset, size);
+ goto out_unlock;
+ }
+ xfs_iunlock(ip, lockmode);
+
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ imap.br_startblock != DELAYSTARTBLOCK) {
+ /*
+ * For unwritten extents do not report a disk address on
+ * the read case (treat as if we're reading into a hole).
+ */
+ if (create || !ISUNWRITTEN(&imap))
+ xfs_map_buffer(inode, bh_result, &imap, offset);
+ if (create && ISUNWRITTEN(&imap)) {
+ if (direct)
+ bh_result->b_private = inode;
+ set_buffer_unwritten(bh_result);
+ }
+ }
+
+ /*
+ * If this is a realtime file, data may be on a different device.
+ * to that pointed to from the buffer_head b_bdev currently.
+ */
+ bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
+
+ /*
+ * If we previously allocated a block out beyond eof and we are now
+ * coming back to use it then we will need to flag it as new even if it
+ * has a disk address.
+ *
+ * With sub-block writes into unwritten extents we also need to mark
+ * the buffer as new so that the unwritten parts of the buffer gets
+ * correctly zeroed.
+ */
+ if (create &&
+ ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+ (offset >= i_size_read(inode)) ||
+ (new || ISUNWRITTEN(&imap))))
+ set_buffer_new(bh_result);
+
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ BUG_ON(direct);
+ if (create) {
+ set_buffer_uptodate(bh_result);
+ set_buffer_mapped(bh_result);
+ set_buffer_delay(bh_result);
+ }
+ }
+
+ /*
+ * If this is O_DIRECT or the mpage code calling tell them how large
+ * the mapping is, so that we can avoid repeated get_blocks calls.
+ */
+ if (direct || size > (1 << inode->i_blkbits)) {
+ xfs_off_t mapping_size;
+
+ mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
+ mapping_size <<= inode->i_blkbits;
+
+ ASSERT(mapping_size > 0);
+ if (mapping_size > size)
+ mapping_size = size;
+ if (mapping_size > LONG_MAX)
+ mapping_size = LONG_MAX;
+
+ bh_result->b_size = mapping_size;
+ }
+
+ return 0;
+
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return -error;
+}
+
+int
+xfs_get_blocks(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+}
+
+STATIC int
+xfs_get_blocks_direct(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+}
+
+/*
+ * Complete a direct I/O write request.
+ *
+ * If the private argument is non-NULL __xfs_get_blocks signals us that we
+ * need to issue a transaction to convert the range from unwritten to written
+ * extents. In case this is regular synchronous I/O we just call xfs_end_io
+ * to do this and we are done. But in case this was a successful AIO
+ * request this handler is called from interrupt context, from which we
+ * can't start transactions. In that case offload the I/O completion to
+ * the workqueues we also use for buffered I/O completion.
+ */
+STATIC void
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private,
+ int ret,
+ bool is_async)
+{
+ struct xfs_ioend *ioend = iocb->private;
+
+ /*
+ * blockdev_direct_IO can return an error even after the I/O
+ * completion handler was called. Thus we need to protect
+ * against double-freeing.
+ */
+ iocb->private = NULL;
+
+ ioend->io_offset = offset;
+ ioend->io_size = size;
+ if (private && size > 0)
+ ioend->io_type = IO_UNWRITTEN;
+
+ if (is_async) {
+ /*
+ * If we are converting an unwritten extent we need to delay
+ * the AIO completion until after the unwrittent extent
+ * conversion has completed, otherwise do it ASAP.
+ */
+ if (ioend->io_type == IO_UNWRITTEN) {
+ ioend->io_iocb = iocb;
+ ioend->io_result = ret;
+ } else {
+ aio_complete(iocb, ret, 0);
+ }
+ xfs_finish_ioend(ioend);
+ } else {
+ xfs_finish_ioend_sync(ioend);
+ }
+}
+
+STATIC ssize_t
+xfs_vm_direct_IO(
+ int rw,
+ struct kiocb *iocb,
+ const struct iovec *iov,
+ loff_t offset,
+ unsigned long nr_segs)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct block_device *bdev = xfs_find_bdev_for_inode(inode);
+ ssize_t ret;
+
+ if (rw & WRITE) {
+ iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
+
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ xfs_end_io_direct_write, NULL, 0);
+ if (ret != -EIOCBQUEUED && iocb->private)
+ xfs_destroy_ioend(iocb->private);
+ } else {
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+ offset, nr_segs,
+ xfs_get_blocks_direct,
+ NULL, NULL, 0);
+ }
+
+ return ret;
+}
+
+STATIC void
+xfs_vm_write_failed(
+ struct address_space *mapping,
+ loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ /*
+ * punch out the delalloc blocks we have already allocated. We
+ * don't call xfs_setattr() to do this as we may be in the
+ * middle of a multi-iovec write and so the vfs inode->i_size
+ * will not match the xfs ip->i_size and so it will zero too
+ * much. Hence we jus truncate the page cache to zero what is
+ * necessary and punch the delalloc blocks directly.
+ */
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t end_fsb;
+ int error;
+
+ truncate_pagecache(inode, to, inode->i_size);
+
+ /*
+ * Check if there are any blocks that are outside of i_size
+ * that need to be trimmed back.
+ */
+ start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+ if (end_fsb <= start_fsb)
+ return;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+ end_fsb - start_fsb);
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "xfs_vm_write_failed: unable to clean up ino %lld",
+ ip->i_ino);
+ }
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+}
+
+STATIC int
+xfs_vm_write_begin(
+ struct file *file,
+ struct address_space *mapping,
+ loff_t pos,
+ unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ int ret;
+
+ ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
+ pagep, xfs_get_blocks);
+ if (unlikely(ret))
+ xfs_vm_write_failed(mapping, pos + len);
+ return ret;
+}
+
+STATIC int
+xfs_vm_write_end(
+ struct file *file,
+ struct address_space *mapping,
+ loff_t pos,
+ unsigned len,
+ unsigned copied,
+ struct page *page,
+ void *fsdata)
+{
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (unlikely(ret < len))
+ xfs_vm_write_failed(mapping, pos + len);
+ return ret;
+}
+
+STATIC sector_t
+xfs_vm_bmap(
+ struct address_space *mapping,
+ sector_t block)
+{
+ struct inode *inode = (struct inode *)mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+
+ trace_xfs_vm_bmap(XFS_I(inode));
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return generic_block_bmap(mapping, block, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpage(
+ struct file *unused,
+ struct page *page)
+{
+ return mpage_readpage(page, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpages(
+ struct file *unused,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+}
+
+const struct address_space_operations xfs_address_space_operations = {
+ .readpage = xfs_vm_readpage,
+ .readpages = xfs_vm_readpages,
+ .writepage = xfs_vm_writepage,
+ .writepages = xfs_vm_writepages,
+ .releasepage = xfs_vm_releasepage,
+ .invalidatepage = xfs_vm_invalidatepage,
+ .write_begin = xfs_vm_write_begin,
+ .write_end = xfs_vm_write_end,
+ .bmap = xfs_vm_bmap,
+ .direct_IO = xfs_vm_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
new file mode 100644
index 00000000..71f721e1
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_AOPS_H__
+#define __XFS_AOPS_H__
+
+extern struct workqueue_struct *xfsdatad_workqueue;
+extern struct workqueue_struct *xfsconvertd_workqueue;
+extern mempool_t *xfs_ioend_pool;
+
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+ IO_DIRECT = 0, /* special case for direct I/O ioends */
+ IO_DELALLOC, /* mapping covers delalloc region */
+ IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
+ IO_OVERWRITE, /* mapping covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+ { 0, "" }, \
+ { IO_DELALLOC, "delalloc" }, \
+ { IO_UNWRITTEN, "unwritten" }, \
+ { IO_OVERWRITE, "overwrite" }
+
+/*
+ * xfs_ioend struct manages large extent writes for XFS.
+ * It can manage several multi-page bio's at once.
+ */
+typedef struct xfs_ioend {
+ struct xfs_ioend *io_list; /* next ioend in chain */
+ unsigned int io_type; /* delalloc / unwritten */
+ int io_error; /* I/O error code */
+ atomic_t io_remaining; /* hold count */
+ struct inode *io_inode; /* file being written to */
+ struct buffer_head *io_buffer_head;/* buffer linked list head */
+ struct buffer_head *io_buffer_tail;/* buffer linked list tail */
+ size_t io_size; /* size of the extent */
+ xfs_off_t io_offset; /* offset in the file */
+ struct work_struct io_work; /* xfsdatad work queue */
+ struct kiocb *io_iocb;
+ int io_result;
+} xfs_ioend_t;
+
+extern const struct address_space_operations xfs_address_space_operations;
+extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+extern void xfs_ioend_init(void);
+extern void xfs_ioend_wait(struct xfs_inode *);
+
+extern void xfs_count_page_state(struct page *, int *, int *);
+
+#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
new file mode 100644
index 00000000..5e68099d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -0,0 +1,1899 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/bio.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/percpu.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+#include <linux/migrate.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
+
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+
+static kmem_zone_t *xfs_buf_zone;
+STATIC int xfsbufd(void *);
+STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
+
+static struct workqueue_struct *xfslogd_workqueue;
+struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsconvertd_workqueue;
+
+#ifdef XFS_BUF_LOCK_TRACKING
+# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
+# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
+# define XB_GET_OWNER(bp) ((bp)->b_last_holder)
+#else
+# define XB_SET_OWNER(bp) do { } while (0)
+# define XB_CLEAR_OWNER(bp) do { } while (0)
+# define XB_GET_OWNER(bp) do { } while (0)
+#endif
+
+#define xb_to_gfp(flags) \
+ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
+ ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
+
+#define xb_to_km(flags) \
+ (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
+
+#define xfs_buf_allocate(flags) \
+ kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
+#define xfs_buf_deallocate(bp) \
+ kmem_zone_free(xfs_buf_zone, (bp));
+
+static inline int
+xfs_buf_is_vmapped(
+ struct xfs_buf *bp)
+{
+ /*
+ * Return true if the buffer is vmapped.
+ *
+ * The XBF_MAPPED flag is set if the buffer should be mapped, but the
+ * code is clever enough to know it doesn't have to map a single page,
+ * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+ */
+ return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+ struct xfs_buf *bp)
+{
+ return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
+/*
+ * xfs_buf_lru_add - add a buffer to the LRU.
+ *
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
+ */
+STATIC void
+xfs_buf_lru_add(
+ struct xfs_buf *bp)
+{
+ struct xfs_buftarg *btp = bp->b_target;
+
+ spin_lock(&btp->bt_lru_lock);
+ if (list_empty(&bp->b_lru)) {
+ atomic_inc(&bp->b_hold);
+ list_add_tail(&bp->b_lru, &btp->bt_lru);
+ btp->bt_lru_nr++;
+ }
+ spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are not
+ * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
+ * to optimise the shrinker removing the buffer from the LRU and calling
+ * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
+ * bt_lru_lock.
+ */
+STATIC void
+xfs_buf_lru_del(
+ struct xfs_buf *bp)
+{
+ struct xfs_buftarg *btp = bp->b_target;
+
+ if (list_empty(&bp->b_lru))
+ return;
+
+ spin_lock(&btp->bt_lru_lock);
+ if (!list_empty(&bp->b_lru)) {
+ list_del_init(&bp->b_lru);
+ btp->bt_lru_nr--;
+ }
+ spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+ struct xfs_buf *bp)
+{
+ bp->b_flags |= XBF_STALE;
+ atomic_set(&(bp)->b_lru_ref, 0);
+ if (!list_empty(&bp->b_lru)) {
+ struct xfs_buftarg *btp = bp->b_target;
+
+ spin_lock(&btp->bt_lru_lock);
+ if (!list_empty(&bp->b_lru)) {
+ list_del_init(&bp->b_lru);
+ btp->bt_lru_nr--;
+ atomic_dec(&bp->b_hold);
+ }
+ spin_unlock(&btp->bt_lru_lock);
+ }
+ ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
+
+STATIC void
+_xfs_buf_initialize(
+ xfs_buf_t *bp,
+ xfs_buftarg_t *target,
+ xfs_off_t range_base,
+ size_t range_length,
+ xfs_buf_flags_t flags)
+{
+ /*
+ * We don't want certain flags to appear in b_flags.
+ */
+ flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
+
+ memset(bp, 0, sizeof(xfs_buf_t));
+ atomic_set(&bp->b_hold, 1);
+ atomic_set(&bp->b_lru_ref, 1);
+ init_completion(&bp->b_iowait);
+ INIT_LIST_HEAD(&bp->b_lru);
+ INIT_LIST_HEAD(&bp->b_list);
+ RB_CLEAR_NODE(&bp->b_rbnode);
+ sema_init(&bp->b_sema, 0); /* held, no waiters */
+ XB_SET_OWNER(bp);
+ bp->b_target = target;
+ bp->b_file_offset = range_base;
+ /*
+ * Set buffer_length and count_desired to the same value initially.
+ * I/O routines should use count_desired, which will be the same in
+ * most cases but may be reset (e.g. XFS recovery).
+ */
+ bp->b_buffer_length = bp->b_count_desired = range_length;
+ bp->b_flags = flags;
+ bp->b_bn = XFS_BUF_DADDR_NULL;
+ atomic_set(&bp->b_pin_count, 0);
+ init_waitqueue_head(&bp->b_waiters);
+
+ XFS_STATS_INC(xb_create);
+
+ trace_xfs_buf_init(bp, _RET_IP_);
+}
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_xfs_buf_get_pages(
+ xfs_buf_t *bp,
+ int page_count,
+ xfs_buf_flags_t flags)
+{
+ /* Make sure that we have a page list */
+ if (bp->b_pages == NULL) {
+ bp->b_offset = xfs_buf_poff(bp->b_file_offset);
+ bp->b_page_count = page_count;
+ if (page_count <= XB_PAGES) {
+ bp->b_pages = bp->b_page_array;
+ } else {
+ bp->b_pages = kmem_alloc(sizeof(struct page *) *
+ page_count, xb_to_km(flags));
+ if (bp->b_pages == NULL)
+ return -ENOMEM;
+ }
+ memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
+ }
+ return 0;
+}
+
+/*
+ * Frees b_pages if it was allocated.
+ */
+STATIC void
+_xfs_buf_free_pages(
+ xfs_buf_t *bp)
+{
+ if (bp->b_pages != bp->b_page_array) {
+ kmem_free(bp->b_pages);
+ bp->b_pages = NULL;
+ }
+}
+
+/*
+ * Releases the specified buffer.
+ *
+ * The modification state of any associated pages is left unchanged.
+ * The buffer most not be on any hash - use xfs_buf_rele instead for
+ * hashed and refcounted buffers
+ */
+void
+xfs_buf_free(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_free(bp, _RET_IP_);
+
+ ASSERT(list_empty(&bp->b_lru));
+
+ if (bp->b_flags & _XBF_PAGES) {
+ uint i;
+
+ if (xfs_buf_is_vmapped(bp))
+ vm_unmap_ram(bp->b_addr - bp->b_offset,
+ bp->b_page_count);
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ struct page *page = bp->b_pages[i];
+
+ __free_page(page);
+ }
+ } else if (bp->b_flags & _XBF_KMEM)
+ kmem_free(bp->b_addr);
+ _xfs_buf_free_pages(bp);
+ xfs_buf_deallocate(bp);
+}
+
+/*
+ * Allocates all the pages for buffer in question and builds it's page list.
+ */
+STATIC int
+xfs_buf_allocate_memory(
+ xfs_buf_t *bp,
+ uint flags)
+{
+ size_t size = bp->b_count_desired;
+ size_t nbytes, offset;
+ gfp_t gfp_mask = xb_to_gfp(flags);
+ unsigned short page_count, i;
+ xfs_off_t end;
+ int error;
+
+ /*
+ * for buffers that are contained within a single page, just allocate
+ * the memory from the heap - there's no need for the complexity of
+ * page arrays to keep allocation down to order 0.
+ */
+ if (bp->b_buffer_length < PAGE_SIZE) {
+ bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
+ if (!bp->b_addr) {
+ /* low memory - use alloc_page loop instead */
+ goto use_alloc_page;
+ }
+
+ if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
+ PAGE_MASK) !=
+ ((unsigned long)bp->b_addr & PAGE_MASK)) {
+ /* b_addr spans two pages - use alloc_page instead */
+ kmem_free(bp->b_addr);
+ bp->b_addr = NULL;
+ goto use_alloc_page;
+ }
+ bp->b_offset = offset_in_page(bp->b_addr);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = virt_to_page(bp->b_addr);
+ bp->b_page_count = 1;
+ bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
+ return 0;
+ }
+
+use_alloc_page:
+ end = bp->b_file_offset + bp->b_buffer_length;
+ page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
+ error = _xfs_buf_get_pages(bp, page_count, flags);
+ if (unlikely(error))
+ return error;
+
+ offset = bp->b_offset;
+ bp->b_flags |= _XBF_PAGES;
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ struct page *page;
+ uint retries = 0;
+retry:
+ page = alloc_page(gfp_mask);
+ if (unlikely(page == NULL)) {
+ if (flags & XBF_READ_AHEAD) {
+ bp->b_page_count = i;
+ error = ENOMEM;
+ goto out_free_pages;
+ }
+
+ /*
+ * This could deadlock.
+ *
+ * But until all the XFS lowlevel code is revamped to
+ * handle buffer allocation failures we can't do much.
+ */
+ if (!(++retries % 100))
+ xfs_err(NULL,
+ "possible memory allocation deadlock in %s (mode:0x%x)",
+ __func__, gfp_mask);
+
+ XFS_STATS_INC(xb_page_retries);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+
+ XFS_STATS_INC(xb_page_found);
+
+ nbytes = min_t(size_t, size, PAGE_SIZE - offset);
+ size -= nbytes;
+ bp->b_pages[i] = page;
+ offset = 0;
+ }
+ return 0;
+
+out_free_pages:
+ for (i = 0; i < bp->b_page_count; i++)
+ __free_page(bp->b_pages[i]);
+ return error;
+}
+
+/*
+ * Map buffer into kernel address-space if necessary.
+ */
+STATIC int
+_xfs_buf_map_pages(
+ xfs_buf_t *bp,
+ uint flags)
+{
+ ASSERT(bp->b_flags & _XBF_PAGES);
+ if (bp->b_page_count == 1) {
+ /* A single page buffer is always mappable */
+ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
+ bp->b_flags |= XBF_MAPPED;
+ } else if (flags & XBF_MAPPED) {
+ int retried = 0;
+
+ do {
+ bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+ -1, PAGE_KERNEL);
+ if (bp->b_addr)
+ break;
+ vm_unmap_aliases();
+ } while (retried++ <= 1);
+
+ if (!bp->b_addr)
+ return -ENOMEM;
+ bp->b_addr += bp->b_offset;
+ bp->b_flags |= XBF_MAPPED;
+ }
+
+ return 0;
+}
+
+/*
+ * Finding and Reading Buffers
+ */
+
+/*
+ * Look up, and creates if absent, a lockable buffer for
+ * a given range of an inode. The buffer is returned
+ * locked. If other overlapping buffers exist, they are
+ * released before the new buffer is created and locked,
+ * which may imply that this call will block until those buffers
+ * are unlocked. No I/O is implied by this call.
+ */
+xfs_buf_t *
+_xfs_buf_find(
+ xfs_buftarg_t *btp, /* block device target */
+ xfs_off_t ioff, /* starting offset of range */
+ size_t isize, /* length of range */
+ xfs_buf_flags_t flags,
+ xfs_buf_t *new_bp)
+{
+ xfs_off_t range_base;
+ size_t range_length;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent;
+ xfs_buf_t *bp;
+
+ range_base = (ioff << BBSHIFT);
+ range_length = (isize << BBSHIFT);
+
+ /* Check for IOs smaller than the sector size / not sector aligned */
+ ASSERT(!(range_length < (1 << btp->bt_sshift)));
+ ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
+
+ /* get tree root */
+ pag = xfs_perag_get(btp->bt_mount,
+ xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+ /* walk tree */
+ spin_lock(&pag->pag_buf_lock);
+ rbp = &pag->pag_buf_tree.rb_node;
+ parent = NULL;
+ bp = NULL;
+ while (*rbp) {
+ parent = *rbp;
+ bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+ if (range_base < bp->b_file_offset)
+ rbp = &(*rbp)->rb_left;
+ else if (range_base > bp->b_file_offset)
+ rbp = &(*rbp)->rb_right;
+ else {
+ /*
+ * found a block offset match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching to the right for an exact match.
+ */
+ if (bp->b_buffer_length != range_length) {
+ ASSERT(bp->b_flags & XBF_STALE);
+ rbp = &(*rbp)->rb_right;
+ continue;
+ }
+ atomic_inc(&bp->b_hold);
+ goto found;
+ }
+ }
+
+ /* No match found */
+ if (new_bp) {
+ _xfs_buf_initialize(new_bp, btp, range_base,
+ range_length, flags);
+ rb_link_node(&new_bp->b_rbnode, parent, rbp);
+ rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+ /* the buffer keeps the perag reference until it is freed */
+ new_bp->b_pag = pag;
+ spin_unlock(&pag->pag_buf_lock);
+ } else {
+ XFS_STATS_INC(xb_miss_locked);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+ }
+ return new_bp;
+
+found:
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+
+ if (xfs_buf_cond_lock(bp)) {
+ /* failed, so wait for the lock if requested. */
+ if (!(flags & XBF_TRYLOCK)) {
+ xfs_buf_lock(bp);
+ XFS_STATS_INC(xb_get_locked_waited);
+ } else {
+ xfs_buf_rele(bp);
+ XFS_STATS_INC(xb_busy_locked);
+ return NULL;
+ }
+ }
+
+ /*
+ * if the buffer is stale, clear all the external state associated with
+ * it. We need to keep flags such as how we allocated the buffer memory
+ * intact here.
+ */
+ if (bp->b_flags & XBF_STALE) {
+ ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+ bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
+ }
+
+ trace_xfs_buf_find(bp, flags, _RET_IP_);
+ XFS_STATS_INC(xb_get_locked);
+ return bp;
+}
+
+/*
+ * Assembles a buffer covering the specified range.
+ * Storage in memory for all portions of the buffer will be allocated,
+ * although backing storage may not be.
+ */
+xfs_buf_t *
+xfs_buf_get(
+ xfs_buftarg_t *target,/* target for buffer */
+ xfs_off_t ioff, /* starting offset of range */
+ size_t isize, /* length of range */
+ xfs_buf_flags_t flags)
+{
+ xfs_buf_t *bp, *new_bp;
+ int error = 0;
+
+ new_bp = xfs_buf_allocate(flags);
+ if (unlikely(!new_bp))
+ return NULL;
+
+ bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
+ if (bp == new_bp) {
+ error = xfs_buf_allocate_memory(bp, flags);
+ if (error)
+ goto no_buffer;
+ } else {
+ xfs_buf_deallocate(new_bp);
+ if (unlikely(bp == NULL))
+ return NULL;
+ }
+
+ if (!(bp->b_flags & XBF_MAPPED)) {
+ error = _xfs_buf_map_pages(bp, flags);
+ if (unlikely(error)) {
+ xfs_warn(target->bt_mount,
+ "%s: failed to map pages\n", __func__);
+ goto no_buffer;
+ }
+ }
+
+ XFS_STATS_INC(xb_get);
+
+ /*
+ * Always fill in the block number now, the mapped cases can do
+ * their own overlay of this later.
+ */
+ bp->b_bn = ioff;
+ bp->b_count_desired = bp->b_buffer_length;
+
+ trace_xfs_buf_get(bp, flags, _RET_IP_);
+ return bp;
+
+ no_buffer:
+ if (flags & (XBF_LOCK | XBF_TRYLOCK))
+ xfs_buf_unlock(bp);
+ xfs_buf_rele(bp);
+ return NULL;
+}
+
+STATIC int
+_xfs_buf_read(
+ xfs_buf_t *bp,
+ xfs_buf_flags_t flags)
+{
+ int status;
+
+ ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
+ ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
+ XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+ bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
+ XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+
+ status = xfs_buf_iorequest(bp);
+ if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC))
+ return status;
+ return xfs_buf_iowait(bp);
+}
+
+xfs_buf_t *
+xfs_buf_read(
+ xfs_buftarg_t *target,
+ xfs_off_t ioff,
+ size_t isize,
+ xfs_buf_flags_t flags)
+{
+ xfs_buf_t *bp;
+
+ flags |= XBF_READ;
+
+ bp = xfs_buf_get(target, ioff, isize, flags);
+ if (bp) {
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+ if (!XFS_BUF_ISDONE(bp)) {
+ XFS_STATS_INC(xb_get_read);
+ _xfs_buf_read(bp, flags);
+ } else if (flags & XBF_ASYNC) {
+ /*
+ * Read ahead call which is already satisfied,
+ * drop the buffer
+ */
+ goto no_buffer;
+ } else {
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ }
+ }
+
+ return bp;
+
+ no_buffer:
+ if (flags & (XBF_LOCK | XBF_TRYLOCK))
+ xfs_buf_unlock(bp);
+ xfs_buf_rele(bp);
+ return NULL;
+}
+
+/*
+ * If we are not low on memory then do the readahead in a deadlock
+ * safe manner.
+ */
+void
+xfs_buf_readahead(
+ xfs_buftarg_t *target,
+ xfs_off_t ioff,
+ size_t isize)
+{
+ if (bdi_read_congested(target->bt_bdi))
+ return;
+
+ xfs_buf_read(target, ioff, isize,
+ XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
+}
+
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t daddr,
+ size_t length,
+ int flags)
+{
+ xfs_buf_t *bp;
+ int error;
+
+ bp = xfs_buf_get_uncached(target, length, flags);
+ if (!bp)
+ return NULL;
+
+ /* set up the buffer for a read IO */
+ xfs_buf_lock(bp);
+ XFS_BUF_SET_ADDR(bp, daddr);
+ XFS_BUF_READ(bp);
+ XFS_BUF_BUSY(bp);
+
+ xfsbdstrat(mp, bp);
+ error = xfs_buf_iowait(bp);
+ if (error || bp->b_error) {
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+ return bp;
+}
+
+xfs_buf_t *
+xfs_buf_get_empty(
+ size_t len,
+ xfs_buftarg_t *target)
+{
+ xfs_buf_t *bp;
+
+ bp = xfs_buf_allocate(0);
+ if (bp)
+ _xfs_buf_initialize(bp, target, 0, len, 0);
+ return bp;
+}
+
+/*
+ * Return a buffer allocated as an empty buffer and associated to external
+ * memory via xfs_buf_associate_memory() back to it's empty state.
+ */
+void
+xfs_buf_set_empty(
+ struct xfs_buf *bp,
+ size_t len)
+{
+ if (bp->b_pages)
+ _xfs_buf_free_pages(bp);
+
+ bp->b_pages = NULL;
+ bp->b_page_count = 0;
+ bp->b_addr = NULL;
+ bp->b_file_offset = 0;
+ bp->b_buffer_length = bp->b_count_desired = len;
+ bp->b_bn = XFS_BUF_DADDR_NULL;
+ bp->b_flags &= ~XBF_MAPPED;
+}
+
+static inline struct page *
+mem_to_page(
+ void *addr)
+{
+ if ((!is_vmalloc_addr(addr))) {
+ return virt_to_page(addr);
+ } else {
+ return vmalloc_to_page(addr);
+ }
+}
+
+int
+xfs_buf_associate_memory(
+ xfs_buf_t *bp,
+ void *mem,
+ size_t len)
+{
+ int rval;
+ int i = 0;
+ unsigned long pageaddr;
+ unsigned long offset;
+ size_t buflen;
+ int page_count;
+
+ pageaddr = (unsigned long)mem & PAGE_MASK;
+ offset = (unsigned long)mem - pageaddr;
+ buflen = PAGE_ALIGN(len + offset);
+ page_count = buflen >> PAGE_SHIFT;
+
+ /* Free any previous set of page pointers */
+ if (bp->b_pages)
+ _xfs_buf_free_pages(bp);
+
+ bp->b_pages = NULL;
+ bp->b_addr = mem;
+
+ rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
+ if (rval)
+ return rval;
+
+ bp->b_offset = offset;
+
+ for (i = 0; i < bp->b_page_count; i++) {
+ bp->b_pages[i] = mem_to_page((void *)pageaddr);
+ pageaddr += PAGE_SIZE;
+ }
+
+ bp->b_count_desired = len;
+ bp->b_buffer_length = buflen;
+ bp->b_flags |= XBF_MAPPED;
+
+ return 0;
+}
+
+xfs_buf_t *
+xfs_buf_get_uncached(
+ struct xfs_buftarg *target,
+ size_t len,
+ int flags)
+{
+ unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
+ int error, i;
+ xfs_buf_t *bp;
+
+ bp = xfs_buf_allocate(0);
+ if (unlikely(bp == NULL))
+ goto fail;
+ _xfs_buf_initialize(bp, target, 0, len, 0);
+
+ error = _xfs_buf_get_pages(bp, page_count, 0);
+ if (error)
+ goto fail_free_buf;
+
+ for (i = 0; i < page_count; i++) {
+ bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
+ if (!bp->b_pages[i])
+ goto fail_free_mem;
+ }
+ bp->b_flags |= _XBF_PAGES;
+
+ error = _xfs_buf_map_pages(bp, XBF_MAPPED);
+ if (unlikely(error)) {
+ xfs_warn(target->bt_mount,
+ "%s: failed to map pages\n", __func__);
+ goto fail_free_mem;
+ }
+
+ xfs_buf_unlock(bp);
+
+ trace_xfs_buf_get_uncached(bp, _RET_IP_);
+ return bp;
+
+ fail_free_mem:
+ while (--i >= 0)
+ __free_page(bp->b_pages[i]);
+ _xfs_buf_free_pages(bp);
+ fail_free_buf:
+ xfs_buf_deallocate(bp);
+ fail:
+ return NULL;
+}
+
+/*
+ * Increment reference count on buffer, to hold the buffer concurrently
+ * with another thread which may release (free) the buffer asynchronously.
+ * Must hold the buffer already to call this function.
+ */
+void
+xfs_buf_hold(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_hold(bp, _RET_IP_);
+ atomic_inc(&bp->b_hold);
+}
+
+/*
+ * Releases a hold on the specified buffer. If the
+ * the hold count is 1, calls xfs_buf_free.
+ */
+void
+xfs_buf_rele(
+ xfs_buf_t *bp)
+{
+ struct xfs_perag *pag = bp->b_pag;
+
+ trace_xfs_buf_rele(bp, _RET_IP_);
+
+ if (!pag) {
+ ASSERT(list_empty(&bp->b_lru));
+ ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
+ if (atomic_dec_and_test(&bp->b_hold))
+ xfs_buf_free(bp);
+ return;
+ }
+
+ ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
+ ASSERT(atomic_read(&bp->b_hold) > 0);
+ if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
+ if (!(bp->b_flags & XBF_STALE) &&
+ atomic_read(&bp->b_lru_ref)) {
+ xfs_buf_lru_add(bp);
+ spin_unlock(&pag->pag_buf_lock);
+ } else {
+ xfs_buf_lru_del(bp);
+ ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
+ rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ spin_unlock(&pag->pag_buf_lock);
+ xfs_perag_put(pag);
+ xfs_buf_free(bp);
+ }
+ }
+}
+
+
+/*
+ * Lock a buffer object, if it is not already locked.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we are
+ * being asked to lock a buffer that has been reallocated. Because it is
+ * pinned, we know that the log has not been pushed to disk and hence it
+ * will still be locked. Rather than continuing to have trylock attempts
+ * fail until someone else pushes the log, push it ourselves before
+ * returning. This means that the xfsaild will not get stuck trying
+ * to push on stale inode buffers.
+ */
+int
+xfs_buf_cond_lock(
+ xfs_buf_t *bp)
+{
+ int locked;
+
+ locked = down_trylock(&bp->b_sema) == 0;
+ if (locked)
+ XB_SET_OWNER(bp);
+ else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+ xfs_log_force(bp->b_target->bt_mount, 0);
+
+ trace_xfs_buf_cond_lock(bp, _RET_IP_);
+ return locked ? 0 : -EBUSY;
+}
+
+int
+xfs_buf_lock_value(
+ xfs_buf_t *bp)
+{
+ return bp->b_sema.count;
+}
+
+/*
+ * Lock a buffer object.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we
+ * are being asked to lock a buffer that has been reallocated. Because
+ * it is pinned, we know that the log has not been pushed to disk and
+ * hence it will still be locked. Rather than sleeping until someone
+ * else pushes the log, push it ourselves before trying to get the lock.
+ */
+void
+xfs_buf_lock(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_lock(bp, _RET_IP_);
+
+ if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+ xfs_log_force(bp->b_target->bt_mount, 0);
+ down(&bp->b_sema);
+ XB_SET_OWNER(bp);
+
+ trace_xfs_buf_lock_done(bp, _RET_IP_);
+}
+
+/*
+ * Releases the lock on the buffer object.
+ * If the buffer is marked delwri but is not queued, do so before we
+ * unlock the buffer as we need to set flags correctly. We also need to
+ * take a reference for the delwri queue because the unlocker is going to
+ * drop their's and they don't know we just queued it.
+ */
+void
+xfs_buf_unlock(
+ xfs_buf_t *bp)
+{
+ if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
+ atomic_inc(&bp->b_hold);
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_delwri_queue(bp, 0);
+ }
+
+ XB_CLEAR_OWNER(bp);
+ up(&bp->b_sema);
+
+ trace_xfs_buf_unlock(bp, _RET_IP_);
+}
+
+STATIC void
+xfs_buf_wait_unpin(
+ xfs_buf_t *bp)
+{
+ DECLARE_WAITQUEUE (wait, current);
+
+ if (atomic_read(&bp->b_pin_count) == 0)
+ return;
+
+ add_wait_queue(&bp->b_waiters, &wait);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&bp->b_pin_count) == 0)
+ break;
+ io_schedule();
+ }
+ remove_wait_queue(&bp->b_waiters, &wait);
+ set_current_state(TASK_RUNNING);
+}
+
+/*
+ * Buffer Utility Routines
+ */
+
+STATIC void
+xfs_buf_iodone_work(
+ struct work_struct *work)
+{
+ xfs_buf_t *bp =
+ container_of(work, xfs_buf_t, b_iodone_work);
+
+ if (bp->b_iodone)
+ (*(bp->b_iodone))(bp);
+ else if (bp->b_flags & XBF_ASYNC)
+ xfs_buf_relse(bp);
+}
+
+void
+xfs_buf_ioend(
+ xfs_buf_t *bp,
+ int schedule)
+{
+ trace_xfs_buf_iodone(bp, _RET_IP_);
+
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+ if (bp->b_error == 0)
+ bp->b_flags |= XBF_DONE;
+
+ if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
+ if (schedule) {
+ INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
+ queue_work(xfslogd_workqueue, &bp->b_iodone_work);
+ } else {
+ xfs_buf_iodone_work(&bp->b_iodone_work);
+ }
+ } else {
+ complete(&bp->b_iowait);
+ }
+}
+
+void
+xfs_buf_ioerror(
+ xfs_buf_t *bp,
+ int error)
+{
+ ASSERT(error >= 0 && error <= 0xffff);
+ bp->b_error = (unsigned short)error;
+ trace_xfs_buf_ioerror(bp, error, _RET_IP_);
+}
+
+int
+xfs_bwrite(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ int error;
+
+ bp->b_flags |= XBF_WRITE;
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
+
+ xfs_buf_delwri_dequeue(bp);
+ xfs_bdstrat_cb(bp);
+
+ error = xfs_buf_iowait(bp);
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_buf_relse(bp);
+ return error;
+}
+
+void
+xfs_bdwrite(
+ void *mp,
+ struct xfs_buf *bp)
+{
+ trace_xfs_buf_bdwrite(bp, _RET_IP_);
+
+ bp->b_flags &= ~XBF_READ;
+ bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
+
+ xfs_buf_delwri_queue(bp, 1);
+}
+
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+ xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+ ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+ /*
+ * No need to wait until the buffer is unpinned, we aren't flushing it.
+ */
+ XFS_BUF_ERROR(bp, EIO);
+
+ /*
+ * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
+ */
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_UNDELAYWRITE(bp);
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_STALE(bp);
+
+ xfs_buf_ioend(bp, 0);
+
+ return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the xfs_buf_ioend call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+ struct xfs_buf *bp)
+{
+ int64_t fl = XFS_BUF_BFLAGS(bp);
+ /*
+ * No need to wait until the buffer is unpinned.
+ * We aren't flushing it.
+ *
+ * chunkhold expects B_DONE to be set, whether
+ * we actually finish the I/O or not. We don't want to
+ * change that interface.
+ */
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_UNDELAYWRITE(bp);
+ XFS_BUF_DONE(bp);
+ XFS_BUF_STALE(bp);
+ XFS_BUF_CLR_IODONE_FUNC(bp);
+ if (!(fl & XBF_ASYNC)) {
+ /*
+ * Mark b_error and B_ERROR _both_.
+ * Lot's of chunkcache code assumes that.
+ * There's no reason to mark error for
+ * ASYNC buffers.
+ */
+ XFS_BUF_ERROR(bp, EIO);
+ XFS_BUF_FINISH_IOWAIT(bp);
+ } else {
+ xfs_buf_relse(bp);
+ }
+
+ return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+ struct xfs_buf *bp)
+{
+ if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+ /*
+ * Metadata write that didn't get logged but
+ * written delayed anyway. These aren't associated
+ * with a transaction, and can be ignored.
+ */
+ if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+ return xfs_bioerror_relse(bp);
+ else
+ return xfs_bioerror(bp);
+ }
+
+ xfs_buf_iorequest(bp);
+ return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem. Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+ xfs_bioerror_relse(bp);
+ return;
+ }
+
+ xfs_buf_iorequest(bp);
+}
+
+STATIC void
+_xfs_buf_ioend(
+ xfs_buf_t *bp,
+ int schedule)
+{
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ xfs_buf_ioend(bp, schedule);
+}
+
+STATIC void
+xfs_buf_bio_end_io(
+ struct bio *bio,
+ int error)
+{
+ xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
+
+ xfs_buf_ioerror(bp, -error);
+
+ if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+ invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
+ _xfs_buf_ioend(bp, 1);
+ bio_put(bio);
+}
+
+STATIC void
+_xfs_buf_ioapply(
+ xfs_buf_t *bp)
+{
+ int rw, map_i, total_nr_pages, nr_pages;
+ struct bio *bio;
+ int offset = bp->b_offset;
+ int size = bp->b_count_desired;
+ sector_t sector = bp->b_bn;
+
+ total_nr_pages = bp->b_page_count;
+ map_i = 0;
+
+ if (bp->b_flags & XBF_ORDERED) {
+ ASSERT(!(bp->b_flags & XBF_READ));
+ rw = WRITE_FLUSH_FUA;
+ } else if (bp->b_flags & XBF_LOG_BUFFER) {
+ ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+ bp->b_flags &= ~_XBF_RUN_QUEUES;
+ rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
+ } else if (bp->b_flags & _XBF_RUN_QUEUES) {
+ ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+ bp->b_flags &= ~_XBF_RUN_QUEUES;
+ rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META;
+ } else {
+ rw = (bp->b_flags & XBF_WRITE) ? WRITE :
+ (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
+ }
+
+
+next_chunk:
+ atomic_inc(&bp->b_io_remaining);
+ nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
+ if (nr_pages > total_nr_pages)
+ nr_pages = total_nr_pages;
+
+ bio = bio_alloc(GFP_NOIO, nr_pages);
+ bio->bi_bdev = bp->b_target->bt_bdev;
+ bio->bi_sector = sector;
+ bio->bi_end_io = xfs_buf_bio_end_io;
+ bio->bi_private = bp;
+
+
+ for (; size && nr_pages; nr_pages--, map_i++) {
+ int rbytes, nbytes = PAGE_SIZE - offset;
+
+ if (nbytes > size)
+ nbytes = size;
+
+ rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
+ if (rbytes < nbytes)
+ break;
+
+ offset = 0;
+ sector += nbytes >> BBSHIFT;
+ size -= nbytes;
+ total_nr_pages--;
+ }
+
+ if (likely(bio->bi_size)) {
+ if (xfs_buf_is_vmapped(bp)) {
+ flush_kernel_vmap_range(bp->b_addr,
+ xfs_buf_vmap_len(bp));
+ }
+ submit_bio(rw, bio);
+ if (size)
+ goto next_chunk;
+ } else {
+ xfs_buf_ioerror(bp, EIO);
+ bio_put(bio);
+ }
+}
+
+int
+xfs_buf_iorequest(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_iorequest(bp, _RET_IP_);
+
+ if (bp->b_flags & XBF_DELWRI) {
+ xfs_buf_delwri_queue(bp, 1);
+ return 0;
+ }
+
+ if (bp->b_flags & XBF_WRITE) {
+ xfs_buf_wait_unpin(bp);
+ }
+
+ xfs_buf_hold(bp);
+
+ /* Set the count to 1 initially, this will stop an I/O
+ * completion callout which happens before we have started
+ * all the I/O from calling xfs_buf_ioend too early.
+ */
+ atomic_set(&bp->b_io_remaining, 1);
+ _xfs_buf_ioapply(bp);
+ _xfs_buf_ioend(bp, 0);
+
+ xfs_buf_rele(bp);
+ return 0;
+}
+
+/*
+ * Waits for I/O to complete on the buffer supplied.
+ * It returns immediately if no I/O is pending.
+ * It returns the I/O error code, if any, or 0 if there was no error.
+ */
+int
+xfs_buf_iowait(
+ xfs_buf_t *bp)
+{
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+
+ wait_for_completion(&bp->b_iowait);
+
+ trace_xfs_buf_iowait_done(bp, _RET_IP_);
+ return bp->b_error;
+}
+
+xfs_caddr_t
+xfs_buf_offset(
+ xfs_buf_t *bp,
+ size_t offset)
+{
+ struct page *page;
+
+ if (bp->b_flags & XBF_MAPPED)
+ return XFS_BUF_PTR(bp) + offset;
+
+ offset += bp->b_offset;
+ page = bp->b_pages[offset >> PAGE_SHIFT];
+ return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+}
+
+/*
+ * Move data into or out of a buffer.
+ */
+void
+xfs_buf_iomove(
+ xfs_buf_t *bp, /* buffer to process */
+ size_t boff, /* starting buffer offset */
+ size_t bsize, /* length to copy */
+ void *data, /* data address */
+ xfs_buf_rw_t mode) /* read/write/zero flag */
+{
+ size_t bend, cpoff, csize;
+ struct page *page;
+
+ bend = boff + bsize;
+ while (boff < bend) {
+ page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
+ cpoff = xfs_buf_poff(boff + bp->b_offset);
+ csize = min_t(size_t,
+ PAGE_SIZE-cpoff, bp->b_count_desired-boff);
+
+ ASSERT(((csize + cpoff) <= PAGE_SIZE));
+
+ switch (mode) {
+ case XBRW_ZERO:
+ memset(page_address(page) + cpoff, 0, csize);
+ break;
+ case XBRW_READ:
+ memcpy(data, page_address(page) + cpoff, csize);
+ break;
+ case XBRW_WRITE:
+ memcpy(page_address(page) + cpoff, data, csize);
+ }
+
+ boff += csize;
+ data += csize;
+ }
+}
+
+/*
+ * Handling of buffer targets (buftargs).
+ */
+
+/*
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
+ */
+void
+xfs_wait_buftarg(
+ struct xfs_buftarg *btp)
+{
+ struct xfs_buf *bp;
+
+restart:
+ spin_lock(&btp->bt_lru_lock);
+ while (!list_empty(&btp->bt_lru)) {
+ bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+ if (atomic_read(&bp->b_hold) > 1) {
+ spin_unlock(&btp->bt_lru_lock);
+ delay(100);
+ goto restart;
+ }
+ /*
+ * clear the LRU reference count so the bufer doesn't get
+ * ignored in xfs_buf_rele().
+ */
+ atomic_set(&bp->b_lru_ref, 0);
+ spin_unlock(&btp->bt_lru_lock);
+ xfs_buf_rele(bp);
+ spin_lock(&btp->bt_lru_lock);
+ }
+ spin_unlock(&btp->bt_lru_lock);
+}
+
+int
+xfs_buftarg_shrink(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_buftarg *btp = container_of(shrink,
+ struct xfs_buftarg, bt_shrinker);
+ struct xfs_buf *bp;
+ int nr_to_scan = sc->nr_to_scan;
+ LIST_HEAD(dispose);
+
+ if (!nr_to_scan)
+ return btp->bt_lru_nr;
+
+ spin_lock(&btp->bt_lru_lock);
+ while (!list_empty(&btp->bt_lru)) {
+ if (nr_to_scan-- <= 0)
+ break;
+
+ bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+
+ /*
+ * Decrement the b_lru_ref count unless the value is already
+ * zero. If the value is already zero, we need to reclaim the
+ * buffer, otherwise it gets another trip through the LRU.
+ */
+ if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+ list_move_tail(&bp->b_lru, &btp->bt_lru);
+ continue;
+ }
+
+ /*
+ * remove the buffer from the LRU now to avoid needing another
+ * lock round trip inside xfs_buf_rele().
+ */
+ list_move(&bp->b_lru, &dispose);
+ btp->bt_lru_nr--;
+ }
+ spin_unlock(&btp->bt_lru_lock);
+
+ while (!list_empty(&dispose)) {
+ bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+ list_del_init(&bp->b_lru);
+ xfs_buf_rele(bp);
+ }
+
+ return btp->bt_lru_nr;
+}
+
+void
+xfs_free_buftarg(
+ struct xfs_mount *mp,
+ struct xfs_buftarg *btp)
+{
+ unregister_shrinker(&btp->bt_shrinker);
+
+ xfs_flush_buftarg(btp, 1);
+ if (mp->m_flags & XFS_MOUNT_BARRIER)
+ xfs_blkdev_issue_flush(btp);
+
+ kthread_stop(btp->bt_task);
+ kmem_free(btp);
+}
+
+STATIC int
+xfs_setsize_buftarg_flags(
+ xfs_buftarg_t *btp,
+ unsigned int blocksize,
+ unsigned int sectorsize,
+ int verbose)
+{
+ btp->bt_bsize = blocksize;
+ btp->bt_sshift = ffs(sectorsize) - 1;
+ btp->bt_smask = sectorsize - 1;
+
+ if (set_blocksize(btp->bt_bdev, sectorsize)) {
+ xfs_warn(btp->bt_mount,
+ "Cannot set_blocksize to %u on device %s\n",
+ sectorsize, XFS_BUFTARG_NAME(btp));
+ return EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * When allocating the initial buffer target we have not yet
+ * read in the superblock, so don't know what sized sectors
+ * are being used is at this early stage. Play safe.
+ */
+STATIC int
+xfs_setsize_buftarg_early(
+ xfs_buftarg_t *btp,
+ struct block_device *bdev)
+{
+ return xfs_setsize_buftarg_flags(btp,
+ PAGE_SIZE, bdev_logical_block_size(bdev), 0);
+}
+
+int
+xfs_setsize_buftarg(
+ xfs_buftarg_t *btp,
+ unsigned int blocksize,
+ unsigned int sectorsize)
+{
+ return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+}
+
+STATIC int
+xfs_alloc_delwrite_queue(
+ xfs_buftarg_t *btp,
+ const char *fsname)
+{
+ INIT_LIST_HEAD(&btp->bt_delwrite_queue);
+ spin_lock_init(&btp->bt_delwrite_lock);
+ btp->bt_flags = 0;
+ btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
+ if (IS_ERR(btp->bt_task))
+ return PTR_ERR(btp->bt_task);
+ return 0;
+}
+
+xfs_buftarg_t *
+xfs_alloc_buftarg(
+ struct xfs_mount *mp,
+ struct block_device *bdev,
+ int external,
+ const char *fsname)
+{
+ xfs_buftarg_t *btp;
+
+ btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+
+ btp->bt_mount = mp;
+ btp->bt_dev = bdev->bd_dev;
+ btp->bt_bdev = bdev;
+ btp->bt_bdi = blk_get_backing_dev_info(bdev);
+ if (!btp->bt_bdi)
+ goto error;
+
+ INIT_LIST_HEAD(&btp->bt_lru);
+ spin_lock_init(&btp->bt_lru_lock);
+ if (xfs_setsize_buftarg_early(btp, bdev))
+ goto error;
+ if (xfs_alloc_delwrite_queue(btp, fsname))
+ goto error;
+ btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+ btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&btp->bt_shrinker);
+ return btp;
+
+error:
+ kmem_free(btp);
+ return NULL;
+}
+
+
+/*
+ * Delayed write buffer handling
+ */
+STATIC void
+xfs_buf_delwri_queue(
+ xfs_buf_t *bp,
+ int unlock)
+{
+ struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
+ spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
+
+ trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
+ ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
+
+ spin_lock(dwlk);
+ /* If already in the queue, dequeue and place at tail */
+ if (!list_empty(&bp->b_list)) {
+ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+ if (unlock)
+ atomic_dec(&bp->b_hold);
+ list_del(&bp->b_list);
+ }
+
+ if (list_empty(dwq)) {
+ /* start xfsbufd as it is about to have something to do */
+ wake_up_process(bp->b_target->bt_task);
+ }
+
+ bp->b_flags |= _XBF_DELWRI_Q;
+ list_add_tail(&bp->b_list, dwq);
+ bp->b_queuetime = jiffies;
+ spin_unlock(dwlk);
+
+ if (unlock)
+ xfs_buf_unlock(bp);
+}
+
+void
+xfs_buf_delwri_dequeue(
+ xfs_buf_t *bp)
+{
+ spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
+ int dequeued = 0;
+
+ spin_lock(dwlk);
+ if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
+ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+ list_del_init(&bp->b_list);
+ dequeued = 1;
+ }
+ bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
+ spin_unlock(dwlk);
+
+ if (dequeued)
+ xfs_buf_rele(bp);
+
+ trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
+}
+
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+ struct xfs_buf *bp)
+{
+ struct xfs_buftarg *btp = bp->b_target;
+ long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+ ASSERT(bp->b_flags & XBF_DELWRI);
+ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+ /*
+ * Check the buffer age before locking the delayed write queue as we
+ * don't need to promote buffers that are already past the flush age.
+ */
+ if (bp->b_queuetime < jiffies - age)
+ return;
+ bp->b_queuetime = jiffies - age;
+ spin_lock(&btp->bt_delwrite_lock);
+ list_move(&bp->b_list, &btp->bt_delwrite_queue);
+ spin_unlock(&btp->bt_delwrite_lock);
+}
+
+STATIC void
+xfs_buf_runall_queues(
+ struct workqueue_struct *queue)
+{
+ flush_workqueue(queue);
+}
+
+/*
+ * Move as many buffers as specified to the supplied list
+ * idicating if we skipped any buffers to prevent deadlocks.
+ */
+STATIC int
+xfs_buf_delwri_split(
+ xfs_buftarg_t *target,
+ struct list_head *list,
+ unsigned long age)
+{
+ xfs_buf_t *bp, *n;
+ struct list_head *dwq = &target->bt_delwrite_queue;
+ spinlock_t *dwlk = &target->bt_delwrite_lock;
+ int skipped = 0;
+ int force;
+
+ force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
+ INIT_LIST_HEAD(list);
+ spin_lock(dwlk);
+ list_for_each_entry_safe(bp, n, dwq, b_list) {
+ ASSERT(bp->b_flags & XBF_DELWRI);
+
+ if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
+ if (!force &&
+ time_before(jiffies, bp->b_queuetime + age)) {
+ xfs_buf_unlock(bp);
+ break;
+ }
+
+ bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
+ _XBF_RUN_QUEUES);
+ bp->b_flags |= XBF_WRITE;
+ list_move_tail(&bp->b_list, list);
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
+ } else
+ skipped++;
+ }
+ spin_unlock(dwlk);
+
+ return skipped;
+
+}
+
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
+ struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
+ xfs_daddr_t diff;
+
+ diff = ap->b_bn - bp->b_bn;
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+ xfs_buftarg_t *target,
+ struct list_head *list)
+{
+ list_sort(NULL, list, xfs_buf_cmp);
+}
+
+STATIC int
+xfsbufd(
+ void *data)
+{
+ xfs_buftarg_t *target = (xfs_buftarg_t *)data;
+
+ current->flags |= PF_MEMALLOC;
+
+ set_freezable();
+
+ do {
+ long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+ long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+ struct list_head tmp;
+ struct blk_plug plug;
+
+ if (unlikely(freezing(current))) {
+ set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
+ refrigerator();
+ } else {
+ clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
+ }
+
+ /* sleep for a long time if there is nothing to do. */
+ if (list_empty(&target->bt_delwrite_queue))
+ tout = MAX_SCHEDULE_TIMEOUT;
+ schedule_timeout_interruptible(tout);
+
+ xfs_buf_delwri_split(target, &tmp, age);
+ list_sort(NULL, &tmp, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
+ while (!list_empty(&tmp)) {
+ struct xfs_buf *bp;
+ bp = list_first_entry(&tmp, struct xfs_buf, b_list);
+ list_del_init(&bp->b_list);
+ xfs_bdstrat_cb(bp);
+ }
+ blk_finish_plug(&plug);
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/*
+ * Go through all incore buffers, and release buffers if they belong to
+ * the given device. This is used in filesystem error handling to
+ * preserve the consistency of its metadata.
+ */
+int
+xfs_flush_buftarg(
+ xfs_buftarg_t *target,
+ int wait)
+{
+ xfs_buf_t *bp;
+ int pincount = 0;
+ LIST_HEAD(tmp_list);
+ LIST_HEAD(wait_list);
+ struct blk_plug plug;
+
+ xfs_buf_runall_queues(xfsconvertd_workqueue);
+ xfs_buf_runall_queues(xfsdatad_workqueue);
+ xfs_buf_runall_queues(xfslogd_workqueue);
+
+ set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
+ pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
+
+ /*
+ * Dropped the delayed write list lock, now walk the temporary list.
+ * All I/O is issued async and then if we need to wait for completion
+ * we do that after issuing all the IO.
+ */
+ list_sort(NULL, &tmp_list, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
+ while (!list_empty(&tmp_list)) {
+ bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
+ ASSERT(target == bp->b_target);
+ list_del_init(&bp->b_list);
+ if (wait) {
+ bp->b_flags &= ~XBF_ASYNC;
+ list_add(&bp->b_list, &wait_list);
+ }
+ xfs_bdstrat_cb(bp);
+ }
+ blk_finish_plug(&plug);
+
+ if (wait) {
+ /* Wait for IO to complete. */
+ while (!list_empty(&wait_list)) {
+ bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
+
+ list_del_init(&bp->b_list);
+ xfs_buf_iowait(bp);
+ xfs_buf_relse(bp);
+ }
+ }
+
+ return pincount;
+}
+
+int __init
+xfs_buf_init(void)
+{
+ xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
+ KM_ZONE_HWALIGN, NULL);
+ if (!xfs_buf_zone)
+ goto out;
+
+ xfslogd_workqueue = alloc_workqueue("xfslogd",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
+ if (!xfslogd_workqueue)
+ goto out_free_buf_zone;
+
+ xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
+ if (!xfsdatad_workqueue)
+ goto out_destroy_xfslogd_workqueue;
+
+ xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
+ WQ_MEM_RECLAIM, 1);
+ if (!xfsconvertd_workqueue)
+ goto out_destroy_xfsdatad_workqueue;
+
+ return 0;
+
+ out_destroy_xfsdatad_workqueue:
+ destroy_workqueue(xfsdatad_workqueue);
+ out_destroy_xfslogd_workqueue:
+ destroy_workqueue(xfslogd_workqueue);
+ out_free_buf_zone:
+ kmem_zone_destroy(xfs_buf_zone);
+ out:
+ return -ENOMEM;
+}
+
+void
+xfs_buf_terminate(void)
+{
+ destroy_workqueue(xfsconvertd_workqueue);
+ destroy_workqueue(xfsdatad_workqueue);
+ destroy_workqueue(xfslogd_workqueue);
+ kmem_zone_destroy(xfs_buf_zone);
+}
+
+#ifdef CONFIG_KDB_MODULES
+struct list_head *
+xfs_get_buftarg_list(void)
+{
+ return &xfs_buftarg_list;
+}
+#endif
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
new file mode 100644
index 00000000..36d6ee44
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+
+/*
+ * Base types
+ */
+
+#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
+
+#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
+#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
+#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
+#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
+
+typedef enum {
+ XBRW_READ = 1, /* transfer into target memory */
+ XBRW_WRITE = 2, /* transfer from target memory */
+ XBRW_ZERO = 3, /* Zero target memory */
+} xfs_buf_rw_t;
+
+#define XBF_READ (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
+#define XBF_MAPPED (1 << 2) /* buffer mapped (b_addr valid) */
+#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */
+#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
+#define XBF_ORDERED (1 << 11)/* use ordered writes */
+#define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */
+#define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */
+
+/* flags used only as arguments to access routines */
+#define XBF_LOCK (1 << 14)/* lock requested */
+#define XBF_TRYLOCK (1 << 15)/* lock requested, but do not wait */
+#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
+
+/* flags used only internally */
+#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
+#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
+#define _XBF_KMEM (1 << 20)/* backed by heap memory */
+#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
+
+typedef unsigned int xfs_buf_flags_t;
+
+#define XFS_BUF_FLAGS \
+ { XBF_READ, "READ" }, \
+ { XBF_WRITE, "WRITE" }, \
+ { XBF_MAPPED, "MAPPED" }, \
+ { XBF_ASYNC, "ASYNC" }, \
+ { XBF_DONE, "DONE" }, \
+ { XBF_DELWRI, "DELWRI" }, \
+ { XBF_STALE, "STALE" }, \
+ { XBF_ORDERED, "ORDERED" }, \
+ { XBF_READ_AHEAD, "READ_AHEAD" }, \
+ { XBF_LOCK, "LOCK" }, /* should never be set */\
+ { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
+ { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
+ { _XBF_PAGES, "PAGES" }, \
+ { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
+ { _XBF_KMEM, "KMEM" }, \
+ { _XBF_DELWRI_Q, "DELWRI_Q" }
+
+typedef enum {
+ XBT_FORCE_SLEEP = 0,
+ XBT_FORCE_FLUSH = 1,
+} xfs_buftarg_flags_t;
+
+typedef struct xfs_bufhash {
+ struct list_head bh_list;
+ spinlock_t bh_lock;
+} xfs_bufhash_t;
+
+typedef struct xfs_buftarg {
+ dev_t bt_dev;
+ struct block_device *bt_bdev;
+ struct backing_dev_info *bt_bdi;
+ struct xfs_mount *bt_mount;
+ unsigned int bt_bsize;
+ unsigned int bt_sshift;
+ size_t bt_smask;
+
+ /* per device delwri queue */
+ struct task_struct *bt_task;
+ struct list_head bt_delwrite_queue;
+ spinlock_t bt_delwrite_lock;
+ unsigned long bt_flags;
+
+ /* LRU control structures */
+ struct shrinker bt_shrinker;
+ struct list_head bt_lru;
+ spinlock_t bt_lru_lock;
+ unsigned int bt_lru_nr;
+} xfs_buftarg_t;
+
+struct xfs_buf;
+typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+
+#define XB_PAGES 2
+
+typedef struct xfs_buf {
+ /*
+ * first cacheline holds all the fields needed for an uncontended cache
+ * hit to be fully processed. The semaphore straddles the cacheline
+ * boundary, but the counter and lock sits on the first cacheline,
+ * which is the only bit that is touched if we hit the semaphore
+ * fast-path on locking.
+ */
+ struct rb_node b_rbnode; /* rbtree node */
+ xfs_off_t b_file_offset; /* offset in file */
+ size_t b_buffer_length;/* size of buffer in bytes */
+ atomic_t b_hold; /* reference count */
+ atomic_t b_lru_ref; /* lru reclaim ref count */
+ xfs_buf_flags_t b_flags; /* status flags */
+ struct semaphore b_sema; /* semaphore for lockables */
+
+ struct list_head b_lru; /* lru list */
+ wait_queue_head_t b_waiters; /* unpin waiters */
+ struct list_head b_list;
+ struct xfs_perag *b_pag; /* contains rbtree root */
+ xfs_buftarg_t *b_target; /* buffer target (device) */
+ xfs_daddr_t b_bn; /* block number for I/O */
+ size_t b_count_desired;/* desired transfer size */
+ void *b_addr; /* virtual address of buffer */
+ struct work_struct b_iodone_work;
+ xfs_buf_iodone_t b_iodone; /* I/O completion function */
+ struct completion b_iowait; /* queue for I/O waiters */
+ void *b_fspriv;
+ void *b_fspriv2;
+ struct page **b_pages; /* array of page pointers */
+ struct page *b_page_array[XB_PAGES]; /* inline pages */
+ unsigned long b_queuetime; /* time buffer was queued */
+ atomic_t b_pin_count; /* pin count */
+ atomic_t b_io_remaining; /* #outstanding I/O requests */
+ unsigned int b_page_count; /* size of page array */
+ unsigned int b_offset; /* page offset in first page */
+ unsigned short b_error; /* error code on I/O */
+#ifdef XFS_BUF_LOCK_TRACKING
+ int b_last_holder;
+#endif
+} xfs_buf_t;
+
+
+/* Finding and Reading Buffers */
+extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
+ xfs_buf_flags_t, xfs_buf_t *);
+#define xfs_incore(buftarg,blkno,len,lockit) \
+ _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
+
+extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
+ xfs_buf_flags_t);
+extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
+ xfs_buf_flags_t);
+
+extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
+extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
+extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
+extern void xfs_buf_hold(xfs_buf_t *);
+extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t daddr, size_t length, int flags);
+
+/* Releasing Buffers */
+extern void xfs_buf_free(xfs_buf_t *);
+extern void xfs_buf_rele(xfs_buf_t *);
+
+/* Locking and Unlocking Buffers */
+extern int xfs_buf_cond_lock(xfs_buf_t *);
+extern int xfs_buf_lock_value(xfs_buf_t *);
+extern void xfs_buf_lock(xfs_buf_t *);
+extern void xfs_buf_unlock(xfs_buf_t *);
+
+/* Buffer Read and Write Routines */
+extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
+extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
+
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb(struct xfs_buf *);
+
+extern void xfs_buf_ioend(xfs_buf_t *, int);
+extern void xfs_buf_ioerror(xfs_buf_t *, int);
+extern int xfs_buf_iorequest(xfs_buf_t *);
+extern int xfs_buf_iowait(xfs_buf_t *);
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
+ xfs_buf_rw_t);
+#define xfs_buf_zero(bp, off, len) \
+ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
+
+static inline int xfs_buf_geterror(xfs_buf_t *bp)
+{
+ return bp ? bp->b_error : ENOMEM;
+}
+
+/* Buffer Utility Routines */
+extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+
+/* Delayed Write Buffer Routines */
+extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
+extern void xfs_buf_delwri_promote(xfs_buf_t *);
+
+/* Buffer Daemon Setup Routines */
+extern int xfs_buf_init(void);
+extern void xfs_buf_terminate(void);
+
+#define xfs_buf_target_name(target) \
+ ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
+
+
+#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
+#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
+ ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
+
+void xfs_buf_stale(struct xfs_buf *bp);
+#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
+#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
+#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
+#define XFS_BUF_SUPER_STALE(bp) do { \
+ XFS_BUF_STALE(bp); \
+ xfs_buf_delwri_dequeue(bp); \
+ XFS_BUF_DONE(bp); \
+ } while (0)
+
+#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
+#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
+#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
+
+#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no)
+#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp)
+#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0)
+
+#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
+#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
+#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
+
+#define XFS_BUF_BUSY(bp) do { } while (0)
+#define XFS_BUF_UNBUSY(bp) do { } while (0)
+#define XFS_BUF_ISBUSY(bp) (1)
+
+#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
+#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
+#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
+
+#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED)
+#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
+#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
+
+#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
+#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
+#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
+#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
+
+#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
+#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
+#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
+
+#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
+#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
+#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
+
+#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
+#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
+#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
+#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
+#define XFS_BUF_SET_START(bp) do { } while (0)
+
+#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
+#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
+#define XFS_BUF_ADDR(bp) ((bp)->b_bn)
+#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
+#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
+#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off))
+#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired)
+#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt))
+#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
+#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
+
+static inline void
+xfs_buf_set_ref(
+ struct xfs_buf *bp,
+ int lru_ref)
+{
+ atomic_set(&bp->b_lru_ref, lru_ref);
+}
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
+#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
+
+#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
+
+#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
+#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
+#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp)
+#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp)
+#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
+
+#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
+#define XFS_BUF_TARGET(bp) ((bp)->b_target)
+#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
+
+static inline void xfs_buf_relse(xfs_buf_t *bp)
+{
+ xfs_buf_unlock(bp);
+ xfs_buf_rele(bp);
+}
+
+/*
+ * Handling of buftargs.
+ */
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+ struct block_device *, int, const char *);
+extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
+extern void xfs_wait_buftarg(xfs_buftarg_t *);
+extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
+extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
+
+#ifdef CONFIG_KDB_MODULES
+extern struct list_head *xfs_get_buftarg_list(void);
+#endif
+
+#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
+#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
+
+#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1)
+
+#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 00000000..572494fa
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_trim_extents(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_fsblock_t start,
+ xfs_fsblock_t len,
+ xfs_fsblock_t minlen,
+ __uint64_t *blocks_trimmed)
+{
+ struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ struct xfs_perag *pag;
+ int error;
+ int i;
+
+ pag = xfs_perag_get(mp, agno);
+
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+ if (error || !agbp)
+ goto out_put_perag;
+
+ cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+ /*
+ * Force out the log. This means any transactions that might have freed
+ * space before we took the AGF buffer lock are now on disk, and the
+ * volatile disk cache is flushed.
+ */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * Look up the longest btree in the AGF and start with it.
+ */
+ error = xfs_alloc_lookup_le(cur, 0,
+ be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
+ if (error)
+ goto out_del_cursor;
+
+ /*
+ * Loop until we are done with all extents that are large
+ * enough to be worth discarding.
+ */
+ while (i) {
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+
+ error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+ if (error)
+ goto out_del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+ ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+
+ /*
+ * Too small? Give up.
+ */
+ if (flen < minlen) {
+ trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+ goto out_del_cursor;
+ }
+
+ /*
+ * If the extent is entirely outside of the range we are
+ * supposed to discard skip it. Do not bother to trim
+ * down partially overlapping ranges for now.
+ */
+ if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+ XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+ trace_xfs_discard_exclude(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ /*
+ * If any blocks in the range are still busy, skip the
+ * discard and try again the next time.
+ */
+ if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+ trace_xfs_discard_busy(mp, agno, fbno, flen);
+ goto next_extent;
+ }
+
+ trace_xfs_discard_extent(mp, agno, fbno, flen);
+ error = -blkdev_issue_discard(bdev,
+ XFS_AGB_TO_DADDR(mp, agno, fbno),
+ XFS_FSB_TO_BB(mp, flen),
+ GFP_NOFS, 0);
+ if (error)
+ goto out_del_cursor;
+ *blocks_trimmed += flen;
+
+next_extent:
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ goto out_del_cursor;
+ }
+
+out_del_cursor:
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+out_put_perag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+int
+xfs_ioc_trim(
+ struct xfs_mount *mp,
+ struct fstrim_range __user *urange)
+{
+ struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+ unsigned int granularity = q->limits.discard_granularity;
+ struct fstrim_range range;
+ xfs_fsblock_t start, len, minlen;
+ xfs_agnumber_t start_agno, end_agno, agno;
+ __uint64_t blocks_trimmed = 0;
+ int error, last_error = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (!blk_queue_discard(q))
+ return -XFS_ERROR(EOPNOTSUPP);
+ if (copy_from_user(&range, urange, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+
+ /*
+ * Truncating down the len isn't actually quite correct, but using
+ * XFS_B_TO_FSB would mean we trivially get overflows for values
+ * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
+ * used by the fstrim application. In the end it really doesn't
+ * matter as trimming blocks is an advisory interface.
+ */
+ start = XFS_B_TO_FSBT(mp, range.start);
+ len = XFS_B_TO_FSBT(mp, range.len);
+ minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+
+ start_agno = XFS_FSB_TO_AGNO(mp, start);
+ if (start_agno >= mp->m_sb.sb_agcount)
+ return -XFS_ERROR(EINVAL);
+
+ end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+ if (end_agno >= mp->m_sb.sb_agcount)
+ end_agno = mp->m_sb.sb_agcount - 1;
+
+ for (agno = start_agno; agno <= end_agno; agno++) {
+ error = -xfs_trim_extents(mp, agno, start, len, minlen,
+ &blocks_trimmed);
+ if (error)
+ last_error = error;
+ }
+
+ if (last_error)
+ return last_error;
+
+ range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+ if (copy_to_user(urange, &range, sizeof(range)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+int
+xfs_discard_extents(
+ struct xfs_mount *mp,
+ struct list_head *list)
+{
+ struct xfs_busy_extent *busyp;
+ int error = 0;
+
+ list_for_each_entry(busyp, list, list) {
+ trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+ busyp->length);
+
+ error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, 0);
+ if (error && error != EOPNOTSUPP) {
+ xfs_info(mp,
+ "discard failed for extent [0x%llu,%u], error %d",
+ (unsigned long long)busyp->bno,
+ busyp->length,
+ error);
+ return error;
+ }
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 00000000..344879ae
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,10 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+struct list_head;
+
+extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
new file mode 100644
index 00000000..fed3f3c8
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_types.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_export.h"
+#include "xfs_vnodeops.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+
+/*
+ * Note that we only accept fileids which are long enough rather than allow
+ * the parent generation number to default to zero. XFS considers zero a
+ * valid generation number not an invalid/wildcard value.
+ */
+static int xfs_fileid_length(int fileid_type)
+{
+ switch (fileid_type) {
+ case FILEID_INO32_GEN:
+ return 2;
+ case FILEID_INO32_GEN_PARENT:
+ return 4;
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ return 3;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ return 6;
+ }
+ return 255; /* invalid */
+}
+
+STATIC int
+xfs_fs_encode_fh(
+ struct dentry *dentry,
+ __u32 *fh,
+ int *max_len,
+ int connectable)
+{
+ struct fid *fid = (struct fid *)fh;
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh;
+ struct inode *inode = dentry->d_inode;
+ int fileid_type;
+ int len;
+
+ /* Directories don't need their parent encoded, they have ".." */
+ if (S_ISDIR(inode->i_mode) || !connectable)
+ fileid_type = FILEID_INO32_GEN;
+ else
+ fileid_type = FILEID_INO32_GEN_PARENT;
+
+ /*
+ * If the the filesystem may contain 64bit inode numbers, we need
+ * to use larger file handles that can represent them.
+ *
+ * While we only allocate inodes that do not fit into 32 bits any
+ * large enough filesystem may contain them, thus the slightly
+ * confusing looking conditional below.
+ */
+ if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+ (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
+ fileid_type |= XFS_FILEID_TYPE_64FLAG;
+
+ /*
+ * Only encode if there is enough space given. In practice
+ * this means we can't export a filesystem with 64bit inodes
+ * over NFSv2 with the subtree_check export option; the other
+ * seven combinations work. The real answer is "don't use v2".
+ */
+ len = xfs_fileid_length(fileid_type);
+ if (*max_len < len) {
+ *max_len = len;
+ return 255;
+ }
+ *max_len = len;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ spin_lock(&dentry->d_lock);
+ fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
+ fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
+ spin_unlock(&dentry->d_lock);
+ /*FALLTHRU*/
+ case FILEID_INO32_GEN:
+ fid->i32.ino = XFS_I(inode)->i_ino;
+ fid->i32.gen = inode->i_generation;
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ spin_lock(&dentry->d_lock);
+ fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
+ fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
+ spin_unlock(&dentry->d_lock);
+ /*FALLTHRU*/
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ fid64->ino = XFS_I(inode)->i_ino;
+ fid64->gen = inode->i_generation;
+ break;
+ }
+
+ return fileid_type;
+}
+
+STATIC struct inode *
+xfs_nfs_get_inode(
+ struct super_block *sb,
+ u64 ino,
+ u32 generation)
+ {
+ xfs_mount_t *mp = XFS_M(sb);
+ xfs_inode_t *ip;
+ int error;
+
+ /*
+ * NFS can sometimes send requests for ino 0. Fail them gracefully.
+ */
+ if (ino == 0)
+ return ERR_PTR(-ESTALE);
+
+ /*
+ * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
+ * fine and not an indication of a corrupted filesystem as clients can
+ * send invalid file handles and we have to handle it gracefully..
+ */
+ error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
+ if (error) {
+ /*
+ * EINVAL means the inode cluster doesn't exist anymore.
+ * This implies the filehandle is stale, so we should
+ * translate it here.
+ * We don't use ESTALE directly down the chain to not
+ * confuse applications using bulkstat that expect EINVAL.
+ */
+ if (error == EINVAL)
+ error = ESTALE;
+ return ERR_PTR(-error);
+ }
+
+ if (ip->i_d.di_gen != generation) {
+ IRELE(ip);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return VFS_I(ip);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fileid_type)
+{
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
+ struct inode *inode = NULL;
+
+ if (fh_len < xfs_fileid_length(fileid_type))
+ return NULL;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ case FILEID_INO32_GEN:
+ inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+ inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen);
+ break;
+ }
+
+ return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fileid_type)
+{
+ struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid;
+ struct inode *inode = NULL;
+
+ switch (fileid_type) {
+ case FILEID_INO32_GEN_PARENT:
+ inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
+ fid->i32.parent_gen);
+ break;
+ case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+ inode = xfs_nfs_get_inode(sb, fid64->parent_ino,
+ fid64->parent_gen);
+ break;
+ }
+
+ return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_get_parent(
+ struct dentry *child)
+{
+ int error;
+ struct xfs_inode *cip;
+
+ error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
+ if (unlikely(error))
+ return ERR_PTR(-error);
+
+ return d_obtain_alias(VFS_I(cip));
+}
+
+STATIC int
+xfs_fs_nfs_commit_metadata(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ int error = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip)) {
+ error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
+ XFS_LOG_SYNC, NULL);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ return error;
+}
+
+const struct export_operations xfs_export_operations = {
+ .encode_fh = xfs_fs_encode_fh,
+ .fh_to_dentry = xfs_fs_fh_to_dentry,
+ .fh_to_parent = xfs_fs_fh_to_parent,
+ .get_parent = xfs_fs_get_parent,
+ .commit_metadata = xfs_fs_nfs_commit_metadata,
+};
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h
new file mode 100644
index 00000000..3272b6ae
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_export.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_EXPORT_H__
+#define __XFS_EXPORT_H__
+
+/*
+ * Common defines for code related to exporting XFS filesystems over NFS.
+ *
+ * The NFS fileid goes out on the wire as an array of
+ * 32bit unsigned ints in host order. There are 5 possible
+ * formats.
+ *
+ * (1) fileid_type=0x00
+ * (no fileid data; handled by the generic code)
+ *
+ * (2) fileid_type=0x01
+ * inode-num
+ * generation
+ *
+ * (3) fileid_type=0x02
+ * inode-num
+ * generation
+ * parent-inode-num
+ * parent-generation
+ *
+ * (4) fileid_type=0x81
+ * inode-num-lo32
+ * inode-num-hi32
+ * generation
+ *
+ * (5) fileid_type=0x82
+ * inode-num-lo32
+ * inode-num-hi32
+ * generation
+ * parent-inode-num-lo32
+ * parent-inode-num-hi32
+ * parent-generation
+ *
+ * Note, the NFS filehandle also includes an fsid portion which
+ * may have an inode number in it. That number is hardcoded to
+ * 32bits and there is no way for XFS to intercept it. In
+ * practice this means when exporting an XFS filesystem with 64bit
+ * inodes you should either export the mountpoint (rather than
+ * a subdirectory) or use the "fsid" export option.
+ */
+
+struct xfs_fid64 {
+ u64 ino;
+ u32 gen;
+ u64 parent_ino;
+ u32 parent_gen;
+} __attribute__((packed));
+
+/* This flag goes on the wire. Don't play with it. */
+#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
+
+#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
new file mode 100644
index 00000000..b679198d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -0,0 +1,1114 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_trans.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_vnodeops.h"
+#include "xfs_da_btree.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+
+#include <linux/dcache.h>
+#include <linux/falloc.h>
+
+static const struct vm_operations_struct xfs_file_vm_ops;
+
+/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+ struct xfs_inode *ip,
+ int type)
+{
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_lock(&VFS_I(ip)->i_mutex);
+ xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_iunlock(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+ struct xfs_inode *ip,
+ int type)
+{
+ xfs_ilock_demote(ip, type);
+ if (type & XFS_IOLOCK_EXCL)
+ mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+/*
+ * xfs_iozero
+ *
+ * xfs_iozero clears the specified range of buffer supplied,
+ * and marks all the affected blocks as valid and modified. If
+ * an affected block is not allocated, it will be allocated. If
+ * an affected block is not completely overwritten, and is not
+ * valid before the operation, it will be read from disk before
+ * being partially zeroed.
+ */
+STATIC int
+xfs_iozero(
+ struct xfs_inode *ip, /* inode */
+ loff_t pos, /* offset in file */
+ size_t count) /* size of data to zero */
+{
+ struct page *page;
+ struct address_space *mapping;
+ int status;
+
+ mapping = VFS_I(ip)->i_mapping;
+ do {
+ unsigned offset, bytes;
+ void *fsdata;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
+
+ zero_user(page, offset, bytes);
+
+ status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+ page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ pos += bytes;
+ count -= bytes;
+ status = 0;
+ } while (count);
+
+ return (-status);
+}
+
+STATIC int
+xfs_file_fsync(
+ struct file *file,
+ int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error = 0;
+ int log_flushed = 0;
+
+ trace_xfs_file_fsync(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ xfs_ioend_wait(ip);
+
+ if (mp->m_flags & XFS_MOUNT_BARRIER) {
+ /*
+ * If we have an RT and/or log subvolume we need to make sure
+ * to flush the write cache the device used for file data
+ * first. This is to ensure newly written file data make
+ * it to disk before logging the new inode size in case of
+ * an extending write.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+ else if (mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
+ }
+
+ /*
+ * We always need to make sure that the required inode state is safe on
+ * disk. The inode might be clean but we still might need to force the
+ * log because of committed transactions that haven't hit the disk yet.
+ * Likewise, there could be unflushed non-transactional changes to the
+ * inode core that have to go to disk and this requires us to issue
+ * a synchronous transaction to capture these changes correctly.
+ *
+ * This code relies on the assumption that if the i_update_core field
+ * of the inode is clear and the inode is unpinned then it is clean
+ * and no action is required.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ /*
+ * First check if the VFS inode is marked dirty. All the dirtying
+ * of non-transactional updates no goes through mark_inode_dirty*,
+ * which allows us to distinguish beteeen pure timestamp updates
+ * and i_size updates which need to be caught for fdatasync.
+ * After that also theck for the dirty state in the XFS inode, which
+ * might gets cleared when the inode gets written out via the AIL
+ * or xfs_iflush_cluster.
+ */
+ if (((inode->i_state & I_DIRTY_DATASYNC) ||
+ ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+ ip->i_update_core) {
+ /*
+ * Kick off a transaction to log the inode core to get the
+ * updates. The sync transaction will also force the log.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0,
+ XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return -error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ /*
+ * Note - it's possible that we might have pushed ourselves out
+ * of the way during trans_reserve which would flush the inode.
+ * But there's no guarantee that the inode buffer has actually
+ * gone out yet (it's delwri). Plus the buffer could be pinned
+ * anyway if it's part of an inode in another recent
+ * transaction. So we play it safe and fire off the
+ * transaction anyway.
+ */
+ xfs_trans_ijoin(tp, ip);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_set_sync(tp);
+ error = _xfs_trans_commit(tp, 0, &log_flushed);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ } else {
+ /*
+ * Timestamps/size haven't changed since last inode flush or
+ * inode transaction commit. That means either nothing got
+ * written or a transaction committed which caught the updates.
+ * If the latter happened and the transaction hasn't hit the
+ * disk yet, the inode will be still be pinned. If it is,
+ * force the log.
+ */
+ if (xfs_ipincount(ip)) {
+ error = _xfs_log_force_lsn(mp,
+ ip->i_itemp->ili_last_lsn,
+ XFS_LOG_SYNC, &log_flushed);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ }
+
+ /*
+ * If we only have a single device, and the log force about was
+ * a no-op we might have to flush the data device cache here.
+ * This can only happen for fdatasync/O_DSYNC if we were overwriting
+ * an already allocated file and thus do not have any metadata to
+ * commit.
+ */
+ if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
+ mp->m_logdev_targp == mp->m_ddev_targp &&
+ !XFS_IS_REALTIME_INODE(ip) &&
+ !log_flushed)
+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
+
+ return -error;
+}
+
+STATIC ssize_t
+xfs_file_aio_read(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ size_t size = 0;
+ ssize_t ret = 0;
+ int ioflags = 0;
+ xfs_fsize_t n;
+ unsigned long seg;
+
+ XFS_STATS_INC(xs_read_calls);
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ if (unlikely(file->f_flags & O_DIRECT))
+ ioflags |= IO_ISDIRECT;
+ if (file->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ /* START copy & waste from filemap.c */
+ for (seg = 0; seg < nr_segs; seg++) {
+ const struct iovec *iv = &iovp[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ size += iv->iov_len;
+ if (unlikely((ssize_t)(size|iv->iov_len) < 0))
+ return XFS_ERROR(-EINVAL);
+ }
+ /* END copy & waste from filemap.c */
+
+ if (unlikely(ioflags & IO_ISDIRECT)) {
+ xfs_buftarg_t *target =
+ XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+ if ((iocb->ki_pos & target->bt_smask) ||
+ (size & target->bt_smask)) {
+ if (iocb->ki_pos == ip->i_size)
+ return 0;
+ return -XFS_ERROR(EINVAL);
+ }
+ }
+
+ n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
+ if (n <= 0 || size == 0)
+ return 0;
+
+ if (n < size)
+ size = n;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /*
+ * Locking is a bit tricky here. If we take an exclusive lock
+ * for direct IO, we effectively serialise all new concurrent
+ * read IO to this file and block it behind IO that is currently in
+ * progress because IO in progress holds the IO lock shared. We only
+ * need to hold the lock exclusive to blow away the page cache, so
+ * only take lock exclusively if the page cache needs invalidation.
+ * This allows the normal direct IO case of no page cache pages to
+ * proceeed concurrently without serialisation.
+ */
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
+ if (inode->i_mapping->nrpages) {
+ ret = -xfs_flushinval_pages(ip,
+ (iocb->ki_pos & PAGE_CACHE_MASK),
+ -1, FI_REMAPF_LOCKED);
+ if (ret) {
+ xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+ return ret;
+ }
+ }
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ }
+
+ trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+
+ ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_read_bytes, ret);
+
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_splice_read(
+ struct file *infilp,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t count,
+ unsigned int flags)
+{
+ struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
+ int ioflags = 0;
+ ssize_t ret;
+
+ XFS_STATS_INC(xs_read_calls);
+
+ if (infilp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+
+ trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_read_bytes, ret);
+
+ xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
+}
+
+STATIC void
+xfs_aio_write_isize_update(
+ struct inode *inode,
+ loff_t *ppos,
+ ssize_t bytes_written)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fsize_t isize = i_size_read(inode);
+
+ if (bytes_written > 0)
+ XFS_STATS_ADD(xs_write_bytes, bytes_written);
+
+ if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+ *ppos > isize))
+ *ppos = isize;
+
+ if (*ppos > ip->i_size) {
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ if (*ppos > ip->i_size)
+ ip->i_size = *ppos;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+}
+
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occurred. In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+ struct xfs_inode *ip)
+{
+ if (ip->i_new_size) {
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ ip->i_new_size = 0;
+ if (ip->i_d.di_size > ip->i_size)
+ ip->i_d.di_size = ip->i_size;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+}
+
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * couuld cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
+STATIC ssize_t
+xfs_file_splice_write(
+ struct pipe_inode_info *pipe,
+ struct file *outfilp,
+ loff_t *ppos,
+ size_t count,
+ unsigned int flags)
+{
+ struct inode *inode = outfilp->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fsize_t new_size;
+ int ioflags = 0;
+ ssize_t ret;
+
+ XFS_STATS_INC(xs_write_calls);
+
+ if (outfilp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ new_size = *ppos + count;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (new_size > ip->i_size)
+ ip->i_new_size = new_size;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
+
+ ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+
+ xfs_aio_write_isize_update(inode, ppos, ret);
+ xfs_aio_write_newsize_update(ip);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last
+ * block of the file that is beyond the EOF. We do this since the
+ * size is being increased without writing anything to that block
+ * and we don't want anyone to read the garbage on the disk.
+ */
+STATIC int /* error (positive) */
+xfs_zero_last_block(
+ xfs_inode_t *ip,
+ xfs_fsize_t offset,
+ xfs_fsize_t isize)
+{
+ xfs_fileoff_t last_fsb;
+ xfs_mount_t *mp = ip->i_mount;
+ int nimaps;
+ int zero_offset;
+ int zero_len;
+ int error = 0;
+ xfs_bmbt_irec_t imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+ if (zero_offset == 0) {
+ /*
+ * There are no extra bytes in the last block on disk to
+ * zero, so return.
+ */
+ return 0;
+ }
+
+ last_fsb = XFS_B_TO_FSBT(mp, isize);
+ nimaps = 1;
+ error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
+ &nimaps, NULL);
+ if (error) {
+ return error;
+ }
+ ASSERT(nimaps > 0);
+ /*
+ * If the block underlying isize is just a hole, then there
+ * is nothing to zero.
+ */
+ if (imap.br_startblock == HOLESTARTBLOCK) {
+ return 0;
+ }
+ /*
+ * Zero the part of the last block beyond the EOF, and write it
+ * out sync. We need to drop the ilock while we do this so we
+ * don't deadlock when the buffer cache calls back to us.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ zero_len = mp->m_sb.sb_blocksize - zero_offset;
+ if (isize + zero_len > offset)
+ zero_len = offset - isize;
+ error = xfs_iozero(ip, isize, zero_len);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ASSERT(error >= 0);
+ return error;
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new,
+ * larger EOF. This handles the normal case of zeroing the remainder
+ * of the last block in the file and the unusual case of zeroing blocks
+ * out beyond the size of the file. This second case only happens
+ * with fixed size extents and when the system crashes before the inode
+ * size was updated but after blocks were allocated. If fill is set,
+ * then any holes in the range are filled and zeroed. If not, the holes
+ * are left alone as holes.
+ */
+
+int /* error (positive) */
+xfs_zero_eof(
+ xfs_inode_t *ip,
+ xfs_off_t offset, /* starting I/O offset */
+ xfs_fsize_t isize) /* current inode size */
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t start_zero_fsb;
+ xfs_fileoff_t end_zero_fsb;
+ xfs_fileoff_t zero_count_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_fileoff_t zero_off;
+ xfs_fsize_t zero_len;
+ int nimaps;
+ int error = 0;
+ xfs_bmbt_irec_t imap;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ ASSERT(offset > isize);
+
+ /*
+ * First handle zeroing the block on which isize resides.
+ * We only zero a part of that block so it is handled specially.
+ */
+ error = xfs_zero_last_block(ip, offset, isize);
+ if (error) {
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ return error;
+ }
+
+ /*
+ * Calculate the range between the new size and the old
+ * where blocks needing to be zeroed may exist. To get the
+ * block where the last byte in the file currently resides,
+ * we need to subtract one from the size and truncate back
+ * to a block boundary. We subtract 1 in case the size is
+ * exactly on a block boundary.
+ */
+ last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+ start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+ end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+ ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+ if (last_fsb == end_zero_fsb) {
+ /*
+ * The size was only incremented on its last block.
+ * We took care of that above, so just return.
+ */
+ return 0;
+ }
+
+ ASSERT(start_zero_fsb <= end_zero_fsb);
+ while (start_zero_fsb <= end_zero_fsb) {
+ nimaps = 1;
+ zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+ error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
+ 0, NULL, 0, &imap, &nimaps, NULL);
+ if (error) {
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ return error;
+ }
+ ASSERT(nimaps > 0);
+
+ if (imap.br_state == XFS_EXT_UNWRITTEN ||
+ imap.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * This loop handles initializing pages that were
+ * partially initialized by the code below this
+ * loop. It basically zeroes the part of the page
+ * that sits on a hole and sets the page as P_HOLE
+ * and calls remapf if it is a mapped file.
+ */
+ start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+ ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+ continue;
+ }
+
+ /*
+ * There are blocks we need to zero.
+ * Drop the inode lock while we're doing the I/O.
+ * We'll still have the iolock to protect us.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+ zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+
+ if ((zero_off + zero_len) > offset)
+ zero_len = offset - zero_off;
+
+ error = xfs_iozero(ip, zero_off, zero_len);
+ if (error) {
+ goto out_lock;
+ }
+
+ start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+ ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ }
+
+ return 0;
+
+out_lock:
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ASSERT(error >= 0);
+ return error;
+}
+
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
+STATIC ssize_t
+xfs_file_aio_write_checks(
+ struct file *file,
+ loff_t *pos,
+ size_t *count,
+ int *iolock)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ xfs_fsize_t new_size;
+ int error = 0;
+
+ xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+ error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+ if (error) {
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+ *iolock = 0;
+ return error;
+ }
+
+ new_size = *pos + *count;
+ if (new_size > ip->i_size)
+ ip->i_new_size = new_size;
+
+ if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+ file_update_time(file);
+
+ /*
+ * If the offset is beyond the size of the file, we need to zero any
+ * blocks that fall between the existing EOF and the start of this
+ * write.
+ */
+ if (*pos > ip->i_size)
+ error = -xfs_zero_eof(ip, *pos, ip->i_size);
+
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /*
+ * If we're writing the file then make sure to clear the setuid and
+ * setgid bits if the process is not being run by root. This keeps
+ * people from modifying setuid and setgid binaries.
+ */
+ return file_remove_suid(file);
+
+}
+
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer. To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos,
+ size_t ocount,
+ int *iolock)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ ssize_t ret = 0;
+ size_t count = ocount;
+ int unaligned_io = 0;
+ struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+
+ *iolock = 0;
+ if ((pos & target->bt_smask) || (count & target->bt_smask))
+ return -XFS_ERROR(EINVAL);
+
+ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+ unaligned_io = 1;
+
+ if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+ *iolock = XFS_IOLOCK_EXCL;
+ else
+ *iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, *iolock);
+
+ ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ if (ret)
+ return ret;
+
+ /*
+ * Recheck if there are cached pages that need invalidate after we got
+ * the iolock to protect against other threads adding new pages while
+ * we were waiting for the iolock.
+ */
+ if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ }
+
+ if (mapping->nrpages) {
+ ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+ FI_REMAPF_LOCKED);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If we are doing unaligned IO, wait for all other IO to drain,
+ * otherwise demote the lock if we had to flush cached pages
+ */
+ if (unaligned_io)
+ xfs_ioend_wait(ip);
+ else if (*iolock == XFS_IOLOCK_EXCL) {
+ xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ *iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_file_direct_write(iocb, iovp,
+ &nr_segs, pos, &iocb->ki_pos, count, ocount);
+
+ /* No fallback to buffered IO on errors for XFS. */
+ ASSERT(ret < 0 || ret == count);
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos,
+ size_t ocount,
+ int *iolock)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ int enospc = 0;
+ size_t count = ocount;
+
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+
+ ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+ if (ret)
+ return ret;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+write_retry:
+ trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+ ret = generic_file_buffered_write(iocb, iovp, nr_segs,
+ pos, &iocb->ki_pos, count, ret);
+ /*
+ * if we just got an ENOSPC, flush the inode now we aren't holding any
+ * page locks and retry *once*
+ */
+ if (ret == -ENOSPC && !enospc) {
+ ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (ret)
+ return ret;
+ enospc = 1;
+ goto write_retry;
+ }
+ current->backing_dev_info = NULL;
+ return ret;
+}
+
+STATIC ssize_t
+xfs_file_aio_write(
+ struct kiocb *iocb,
+ const struct iovec *iovp,
+ unsigned long nr_segs,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ ssize_t ret;
+ int iolock;
+ size_t ocount = 0;
+
+ XFS_STATS_INC(xs_write_calls);
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+ if (ret)
+ return ret;
+
+ if (ocount == 0)
+ return 0;
+
+ xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ if (unlikely(file->f_flags & O_DIRECT))
+ ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+ ocount, &iolock);
+ else
+ ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+ ocount, &iolock);
+
+ xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+
+ if (ret <= 0)
+ goto out_unlock;
+
+ /* Handle various SYNC-type writes */
+ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+ loff_t end = pos + ret - 1;
+ int error, error2;
+
+ xfs_rw_iunlock(ip, iolock);
+ error = filemap_write_and_wait_range(mapping, pos, end);
+ xfs_rw_ilock(ip, iolock);
+
+ error2 = -xfs_file_fsync(file,
+ (file->f_flags & __O_SYNC) ? 0 : 1);
+ if (error)
+ ret = error;
+ else if (error2)
+ ret = error2;
+ }
+
+out_unlock:
+ xfs_aio_write_newsize_update(ip);
+ xfs_rw_iunlock(ip, iolock);
+ return ret;
+}
+
+STATIC long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ long error;
+ loff_t new_size = 0;
+ xfs_flock64_t bf;
+ xfs_inode_t *ip = XFS_I(inode);
+ int cmd = XFS_IOC_RESVSP;
+ int attr_flags = XFS_ATTR_NOLOCK;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ bf.l_whence = 0;
+ bf.l_start = offset;
+ bf.l_len = len;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ cmd = XFS_IOC_UNRESVSP;
+
+ /* check the new inode size is valid before allocating */
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (file->f_flags & O_DSYNC)
+ attr_flags |= XFS_ATTR_SYNC;
+
+ error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
+ if (error)
+ goto out_unlock;
+
+ /* Change file size if needed */
+ if (new_size) {
+ struct iattr iattr;
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = new_size;
+ error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
+ }
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+
+STATIC int
+xfs_file_open(
+ struct inode *inode,
+ struct file *file)
+{
+ if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+ return -EFBIG;
+ if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ return -EIO;
+ return 0;
+}
+
+STATIC int
+xfs_dir_open(
+ struct inode *inode,
+ struct file *file)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int mode;
+ int error;
+
+ error = xfs_file_open(inode, file);
+ if (error)
+ return error;
+
+ /*
+ * If there are any blocks, read-ahead block 0 as we're almost
+ * certain to have the next operation be a read there.
+ */
+ mode = xfs_ilock_map_shared(ip);
+ if (ip->i_d.di_nextents > 0)
+ xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+ xfs_iunlock(ip, mode);
+ return 0;
+}
+
+STATIC int
+xfs_file_release(
+ struct inode *inode,
+ struct file *filp)
+{
+ return -xfs_release(XFS_I(inode));
+}
+
+STATIC int
+xfs_file_readdir(
+ struct file *filp,
+ void *dirent,
+ filldir_t filldir)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ xfs_inode_t *ip = XFS_I(inode);
+ int error;
+ size_t bufsize;
+
+ /*
+ * The Linux API doesn't pass down the total size of the buffer
+ * we read into down to the filesystem. With the filldir concept
+ * it's not needed for correct information, but the XFS dir2 leaf
+ * code wants an estimate of the buffer size to calculate it's
+ * readahead window and size the buffers used for mapping to
+ * physical blocks.
+ *
+ * Try to give it an estimate that's good enough, maybe at some
+ * point we can change the ->readdir prototype to include the
+ * buffer size. For now we use the current glibc buffer size.
+ */
+ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+
+ error = xfs_readdir(ip, dirent, bufsize,
+ (xfs_off_t *)&filp->f_pos, filldir);
+ if (error)
+ return -error;
+ return 0;
+}
+
+STATIC int
+xfs_file_mmap(
+ struct file *filp,
+ struct vm_area_struct *vma)
+{
+ vma->vm_ops = &xfs_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+
+ file_accessed(filp);
+ return 0;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made
+ * writable. We can set the page state up correctly for a writable
+ * page, which means we can do correct delalloc accounting (ENOSPC
+ * checking!) and unwritten extent mapping.
+ */
+STATIC int
+xfs_vm_page_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ return block_page_mkwrite(vma, vmf, xfs_get_blocks);
+}
+
+const struct file_operations xfs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = xfs_file_aio_read,
+ .aio_write = xfs_file_aio_write,
+ .splice_read = xfs_file_splice_read,
+ .splice_write = xfs_file_splice_write,
+ .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = xfs_file_compat_ioctl,
+#endif
+ .mmap = xfs_file_mmap,
+ .open = xfs_file_open,
+ .release = xfs_file_release,
+ .fsync = xfs_file_fsync,
+ .fallocate = xfs_file_fallocate,
+};
+
+const struct file_operations xfs_dir_file_operations = {
+ .open = xfs_dir_open,
+ .read = generic_read_dir,
+ .readdir = xfs_file_readdir,
+ .llseek = generic_file_llseek,
+ .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = xfs_file_compat_ioctl,
+#endif
+ .fsync = xfs_file_fsync,
+};
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = xfs_vm_page_mkwrite,
+};
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
new file mode 100644
index 00000000..ed88ed16
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_vnodeops.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+
+/*
+ * note: all filemap functions return negative error codes. These
+ * need to be inverted before returning to the xfs core functions.
+ */
+void
+xfs_tosspages(
+ xfs_inode_t *ip,
+ xfs_off_t first,
+ xfs_off_t last,
+ int fiopt)
+{
+ /* can't toss partial tail pages, so mask them out */
+ last &= ~(PAGE_SIZE - 1);
+ truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
+}
+
+int
+xfs_flushinval_pages(
+ xfs_inode_t *ip,
+ xfs_off_t first,
+ xfs_off_t last,
+ int fiopt)
+{
+ struct address_space *mapping = VFS_I(ip)->i_mapping;
+ int ret = 0;
+
+ trace_xfs_pagecache_inval(ip, first, last);
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ ret = filemap_write_and_wait_range(mapping, first,
+ last == -1 ? LLONG_MAX : last);
+ if (!ret)
+ truncate_inode_pages_range(mapping, first, last);
+ return -ret;
+}
+
+int
+xfs_flush_pages(
+ xfs_inode_t *ip,
+ xfs_off_t first,
+ xfs_off_t last,
+ uint64_t flags,
+ int fiopt)
+{
+ struct address_space *mapping = VFS_I(ip)->i_mapping;
+ int ret = 0;
+ int ret2;
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ ret = -filemap_fdatawrite_range(mapping, first,
+ last == -1 ? LLONG_MAX : last);
+ if (flags & XBF_ASYNC)
+ return ret;
+ ret2 = xfs_wait_on_pages(ip, first, last);
+ if (!ret)
+ ret = ret2;
+ return ret;
+}
+
+int
+xfs_wait_on_pages(
+ xfs_inode_t *ip,
+ xfs_off_t first,
+ xfs_off_t last)
+{
+ struct address_space *mapping = VFS_I(ip)->i_mapping;
+
+ if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ return -filemap_fdatawait_range(mapping, first,
+ last == -1 ? ip->i_size - 1 : last);
+ }
+ return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
new file mode 100644
index 00000000..76e81cff
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_sysctl.h"
+
+/*
+ * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
+ * other XFS code uses these values. Times are measured in centisecs (i.e.
+ * 100ths of a second).
+ */
+xfs_param_t xfs_params = {
+ /* MIN DFLT MAX */
+ .sgid_inherit = { 0, 0, 1 },
+ .symlink_mode = { 0, 0, 1 },
+ .panic_mask = { 0, 0, 255 },
+ .error_level = { 0, 3, 11 },
+ .syncd_timer = { 1*100, 30*100, 7200*100},
+ .stats_clear = { 0, 0, 1 },
+ .inherit_sync = { 0, 1, 1 },
+ .inherit_nodump = { 0, 1, 1 },
+ .inherit_noatim = { 0, 1, 1 },
+ .xfs_buf_timer = { 100/2, 1*100, 30*100 },
+ .xfs_buf_age = { 1*100, 15*100, 7200*100},
+ .inherit_nosym = { 0, 0, 1 },
+ .rotorstep = { 1, 1, 255 },
+ .inherit_nodfrg = { 0, 1, 1 },
+ .fstrm_timer = { 1, 30*100, 3600*100},
+};
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
new file mode 100644
index 00000000..acca2c5c
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_ioctl.h"
+#include "xfs_rtalloc.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_dfrag.h"
+#include "xfs_fsops.h"
+#include "xfs_vnodeops.h"
+#include "xfs_discard.h"
+#include "xfs_quota.h"
+#include "xfs_inode_item.h"
+#include "xfs_export.h"
+#include "xfs_trace.h"
+
+#include <linux/capability.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/exportfs.h>
+
+/*
+ * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ * returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ * returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ * returns full handle for a path
+ */
+int
+xfs_find_handle(
+ unsigned int cmd,
+ xfs_fsop_handlereq_t *hreq)
+{
+ int hsize;
+ xfs_handle_t handle;
+ struct inode *inode;
+ struct file *file = NULL;
+ struct path path;
+ int error;
+ struct xfs_inode *ip;
+
+ if (cmd == XFS_IOC_FD_TO_HANDLE) {
+ file = fget(hreq->fd);
+ if (!file)
+ return -EBADF;
+ inode = file->f_path.dentry->d_inode;
+ } else {
+ error = user_lpath((const char __user *)hreq->path, &path);
+ if (error)
+ return error;
+ inode = path.dentry->d_inode;
+ }
+ ip = XFS_I(inode);
+
+ /*
+ * We can only generate handles for inodes residing on a XFS filesystem,
+ * and only for regular files, directories or symbolic links.
+ */
+ error = -EINVAL;
+ if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+ goto out_put;
+
+ error = -EBADF;
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ goto out_put;
+
+
+ memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
+ /*
+ * This handle only contains an fsid, zero the rest.
+ */
+ memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
+ hsize = sizeof(xfs_fsid_t);
+ } else {
+ int lock_mode;
+
+ lock_mode = xfs_ilock_map_shared(ip);
+ handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
+ sizeof(handle.ha_fid.fid_len);
+ handle.ha_fid.fid_pad = 0;
+ handle.ha_fid.fid_gen = ip->i_d.di_gen;
+ handle.ha_fid.fid_ino = ip->i_ino;
+ xfs_iunlock_map_shared(ip, lock_mode);
+
+ hsize = XFS_HSIZE(handle);
+ }
+
+ error = -EFAULT;
+ if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+ copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+ goto out_put;
+
+ error = 0;
+
+ out_put:
+ if (cmd == XFS_IOC_FD_TO_HANDLE)
+ fput(file);
+ else
+ path_put(&path);
+ return error;
+}
+
+/*
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
+ */
+STATIC int
+xfs_handle_acceptable(
+ void *context,
+ struct dentry *dentry)
+{
+ return 1;
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+ struct file *parfilp,
+ void __user *uhandle,
+ u32 hlen)
+{
+ xfs_handle_t handle;
+ struct xfs_fid64 fid;
+
+ /*
+ * Only allow handle opens under a directory.
+ */
+ if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
+ return ERR_PTR(-ENOTDIR);
+
+ if (hlen != sizeof(xfs_handle_t))
+ return ERR_PTR(-EINVAL);
+ if (copy_from_user(&handle, uhandle, hlen))
+ return ERR_PTR(-EFAULT);
+ if (handle.ha_fid.fid_len !=
+ sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
+ return ERR_PTR(-EINVAL);
+
+ memset(&fid, 0, sizeof(struct fid));
+ fid.ino = handle.ha_fid.fid_ino;
+ fid.gen = handle.ha_fid.fid_gen;
+
+ return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
+ FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+ xfs_handle_acceptable, NULL);
+}
+
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
+}
+
+int
+xfs_open_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ const struct cred *cred = current_cred();
+ int error;
+ int fd;
+ int permflag;
+ struct file *filp;
+ struct inode *inode;
+ struct dentry *dentry;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ inode = dentry->d_inode;
+
+ /* Restrict xfs_open_by_handle to directories & regular files. */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+ error = -XFS_ERROR(EPERM);
+ goto out_dput;
+ }
+
+#if BITS_PER_LONG != 32
+ hreq->oflags |= O_LARGEFILE;
+#endif
+
+ /* Put open permission in namei format. */
+ permflag = hreq->oflags;
+ if ((permflag+1) & O_ACCMODE)
+ permflag++;
+ if (permflag & O_TRUNC)
+ permflag |= 2;
+
+ if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
+ (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
+ error = -XFS_ERROR(EPERM);
+ goto out_dput;
+ }
+
+ if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+ error = -XFS_ERROR(EACCES);
+ goto out_dput;
+ }
+
+ /* Can't write directories. */
+ if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+ error = -XFS_ERROR(EISDIR);
+ goto out_dput;
+ }
+
+ fd = get_unused_fd();
+ if (fd < 0) {
+ error = fd;
+ goto out_dput;
+ }
+
+ filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
+ hreq->oflags, cred);
+ if (IS_ERR(filp)) {
+ put_unused_fd(fd);
+ return PTR_ERR(filp);
+ }
+
+ if (inode->i_mode & S_IFREG) {
+ filp->f_flags |= O_NOATIME;
+ filp->f_mode |= FMODE_NOCMTIME;
+ }
+
+ fd_install(fd, filp);
+ return fd;
+
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+/*
+ * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
+ * unused first argument.
+ */
+STATIC int
+do_readlink(
+ char __user *buffer,
+ int buflen,
+ const char *link)
+{
+ int len;
+
+ len = PTR_ERR(link);
+ if (IS_ERR(link))
+ goto out;
+
+ len = strlen(link);
+ if (len > (unsigned) buflen)
+ len = buflen;
+ if (copy_to_user(buffer, link, len))
+ len = -EFAULT;
+ out:
+ return len;
+}
+
+
+int
+xfs_readlink_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq)
+{
+ struct dentry *dentry;
+ __u32 olen;
+ void *link;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+
+ dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ /* Restrict this handle operation to symlinks only. */
+ if (!S_ISLNK(dentry->d_inode->i_mode)) {
+ error = -XFS_ERROR(EINVAL);
+ goto out_dput;
+ }
+
+ if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
+ error = -XFS_ERROR(EFAULT);
+ goto out_dput;
+ }
+
+ link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+ if (!link) {
+ error = -XFS_ERROR(ENOMEM);
+ goto out_dput;
+ }
+
+ error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+ if (error)
+ goto out_kfree;
+ error = do_readlink(hreq->ohandle, olen, link);
+ if (error)
+ goto out_kfree;
+
+ out_kfree:
+ kfree(link);
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+STATIC int
+xfs_fssetdm_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ struct fsdmidata fsd;
+ xfs_fsop_setdm_handlereq_t dmhreq;
+ struct dentry *dentry;
+
+ if (!capable(CAP_MKNOD))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
+ error = -XFS_ERROR(EPERM);
+ goto out;
+ }
+
+ if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
+ error = -XFS_ERROR(EFAULT);
+ goto out;
+ }
+
+ error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+ fsd.fsd_dmstate);
+
+ out:
+ dput(dentry);
+ return error;
+}
+
+STATIC int
+xfs_attrlist_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error = -ENOMEM;
+ attrlist_cursor_kern_t *cursor;
+ xfs_fsop_attrlist_handlereq_t al_hreq;
+ struct dentry *dentry;
+ char *kbuf;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+ if (al_hreq.buflen > XATTR_LIST_MAX)
+ return -XFS_ERROR(EINVAL);
+
+ /*
+ * Reject flags, only allow namespaces.
+ */
+ if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+ return -XFS_ERROR(EINVAL);
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
+ if (!kbuf)
+ goto out_dput;
+
+ cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+ error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+ al_hreq.flags, cursor);
+ if (error)
+ goto out_kfree;
+
+ if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
+ error = -EFAULT;
+
+ out_kfree:
+ kfree(kbuf);
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+int
+xfs_attrmulti_attr_get(
+ struct inode *inode,
+ unsigned char *name,
+ unsigned char __user *ubuf,
+ __uint32_t *len,
+ __uint32_t flags)
+{
+ unsigned char *kbuf;
+ int error = EFAULT;
+
+ if (*len > XATTR_SIZE_MAX)
+ return EINVAL;
+ kbuf = kmalloc(*len, GFP_KERNEL);
+ if (!kbuf)
+ return ENOMEM;
+
+ error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+ if (error)
+ goto out_kfree;
+
+ if (copy_to_user(ubuf, kbuf, *len))
+ error = EFAULT;
+
+ out_kfree:
+ kfree(kbuf);
+ return error;
+}
+
+int
+xfs_attrmulti_attr_set(
+ struct inode *inode,
+ unsigned char *name,
+ const unsigned char __user *ubuf,
+ __uint32_t len,
+ __uint32_t flags)
+{
+ unsigned char *kbuf;
+ int error = EFAULT;
+
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return EPERM;
+ if (len > XATTR_SIZE_MAX)
+ return EINVAL;
+
+ kbuf = memdup_user(ubuf, len);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+
+ return error;
+}
+
+int
+xfs_attrmulti_attr_remove(
+ struct inode *inode,
+ unsigned char *name,
+ __uint32_t flags)
+{
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return EPERM;
+ return xfs_attr_remove(XFS_I(inode), name, flags);
+}
+
+STATIC int
+xfs_attrmulti_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ xfs_attr_multiop_t *ops;
+ xfs_fsop_attrmulti_handlereq_t am_hreq;
+ struct dentry *dentry;
+ unsigned int i, size;
+ unsigned char *attr_name;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+ return -E2BIG;
+
+ dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = E2BIG;
+ size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
+ if (!size || size > 16 * PAGE_SIZE)
+ goto out_dput;
+
+ ops = memdup_user(am_hreq.ops, size);
+ if (IS_ERR(ops)) {
+ error = PTR_ERR(ops);
+ goto out_dput;
+ }
+
+ attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+ if (!attr_name)
+ goto out_kfree_ops;
+
+ error = 0;
+ for (i = 0; i < am_hreq.opcount; i++) {
+ ops[i].am_error = strncpy_from_user((char *)attr_name,
+ ops[i].am_attrname, MAXNAMELEN);
+ if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+ error = -ERANGE;
+ if (ops[i].am_error < 0)
+ break;
+
+ switch (ops[i].am_opcode) {
+ case ATTR_OP_GET:
+ ops[i].am_error = xfs_attrmulti_attr_get(
+ dentry->d_inode, attr_name,
+ ops[i].am_attrvalue, &ops[i].am_length,
+ ops[i].am_flags);
+ break;
+ case ATTR_OP_SET:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
+ ops[i].am_error = xfs_attrmulti_attr_set(
+ dentry->d_inode, attr_name,
+ ops[i].am_attrvalue, ops[i].am_length,
+ ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
+ break;
+ case ATTR_OP_REMOVE:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
+ ops[i].am_error = xfs_attrmulti_attr_remove(
+ dentry->d_inode, attr_name,
+ ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
+ break;
+ default:
+ ops[i].am_error = EINVAL;
+ }
+ }
+
+ if (copy_to_user(am_hreq.ops, ops, size))
+ error = XFS_ERROR(EFAULT);
+
+ kfree(attr_name);
+ out_kfree_ops:
+ kfree(ops);
+ out_dput:
+ dput(dentry);
+ return -error;
+}
+
+int
+xfs_ioc_space(
+ struct xfs_inode *ip,
+ struct inode *inode,
+ struct file *filp,
+ int ioflags,
+ unsigned int cmd,
+ xfs_flock64_t *bf)
+{
+ int attr_flags = 0;
+ int error;
+
+ /*
+ * Only allow the sys admin to reserve space unless
+ * unwritten extents are enabled.
+ */
+ if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
+ !capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+
+ if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
+ return -XFS_ERROR(EPERM);
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -XFS_ERROR(EBADF);
+
+ if (!S_ISREG(inode->i_mode))
+ return -XFS_ERROR(EINVAL);
+
+ if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+ attr_flags |= XFS_ATTR_NONBLOCK;
+
+ if (filp->f_flags & O_DSYNC)
+ attr_flags |= XFS_ATTR_SYNC;
+
+ if (ioflags & IO_INVIS)
+ attr_flags |= XFS_ATTR_DMI;
+
+ error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+ return -error;
+}
+
+STATIC int
+xfs_ioc_bulkstat(
+ xfs_mount_t *mp,
+ unsigned int cmd,
+ void __user *arg)
+{
+ xfs_fsop_bulkreq_t bulkreq;
+ int count; /* # of records returned */
+ xfs_ino_t inlast; /* last inode number */
+ int done;
+ int error;
+
+ /* done = 1 if there are more stats to get and if bulkstat */
+ /* should be called again (unused here, but used in dmapi) */
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
+ return -XFS_ERROR(EFAULT);
+
+ if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+ return -XFS_ERROR(EFAULT);
+
+ if ((count = bulkreq.icount) <= 0)
+ return -XFS_ERROR(EINVAL);
+
+ if (bulkreq.ubuffer == NULL)
+ return -XFS_ERROR(EINVAL);
+
+ if (cmd == XFS_IOC_FSINUMBERS)
+ error = xfs_inumbers(mp, &inlast, &count,
+ bulkreq.ubuffer, xfs_inumbers_fmt);
+ else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
+ error = xfs_bulkstat_single(mp, &inlast,
+ bulkreq.ubuffer, &done);
+ else /* XFS_IOC_FSBULKSTAT */
+ error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
+ sizeof(xfs_bstat_t), bulkreq.ubuffer,
+ &done);
+
+ if (error)
+ return -error;
+
+ if (bulkreq.ocount != NULL) {
+ if (copy_to_user(bulkreq.lastip, &inlast,
+ sizeof(xfs_ino_t)))
+ return -XFS_ERROR(EFAULT);
+
+ if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+ return -XFS_ERROR(EFAULT);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_ioc_fsgeometry_v1(
+ xfs_mount_t *mp,
+ void __user *arg)
+{
+ xfs_fsop_geom_t fsgeo;
+ int error;
+
+ error = xfs_fs_geometry(mp, &fsgeo, 3);
+ if (error)
+ return -error;
+
+ /*
+ * Caller should have passed an argument of type
+ * xfs_fsop_geom_v1_t. This is a proper subset of the
+ * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+ */
+ if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_ioc_fsgeometry(
+ xfs_mount_t *mp,
+ void __user *arg)
+{
+ xfs_fsop_geom_t fsgeo;
+ int error;
+
+ error = xfs_fs_geometry(mp, &fsgeo, 4);
+ if (error)
+ return -error;
+
+ if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+/*
+ * Linux extended inode flags interface.
+ */
+
+STATIC unsigned int
+xfs_merge_ioc_xflags(
+ unsigned int flags,
+ unsigned int start)
+{
+ unsigned int xflags = start;
+
+ if (flags & FS_IMMUTABLE_FL)
+ xflags |= XFS_XFLAG_IMMUTABLE;
+ else
+ xflags &= ~XFS_XFLAG_IMMUTABLE;
+ if (flags & FS_APPEND_FL)
+ xflags |= XFS_XFLAG_APPEND;
+ else
+ xflags &= ~XFS_XFLAG_APPEND;
+ if (flags & FS_SYNC_FL)
+ xflags |= XFS_XFLAG_SYNC;
+ else
+ xflags &= ~XFS_XFLAG_SYNC;
+ if (flags & FS_NOATIME_FL)
+ xflags |= XFS_XFLAG_NOATIME;
+ else
+ xflags &= ~XFS_XFLAG_NOATIME;
+ if (flags & FS_NODUMP_FL)
+ xflags |= XFS_XFLAG_NODUMP;
+ else
+ xflags &= ~XFS_XFLAG_NODUMP;
+
+ return xflags;
+}
+
+STATIC unsigned int
+xfs_di2lxflags(
+ __uint16_t di_flags)
+{
+ unsigned int flags = 0;
+
+ if (di_flags & XFS_DIFLAG_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+ if (di_flags & XFS_DIFLAG_APPEND)
+ flags |= FS_APPEND_FL;
+ if (di_flags & XFS_DIFLAG_SYNC)
+ flags |= FS_SYNC_FL;
+ if (di_flags & XFS_DIFLAG_NOATIME)
+ flags |= FS_NOATIME_FL;
+ if (di_flags & XFS_DIFLAG_NODUMP)
+ flags |= FS_NODUMP_FL;
+ return flags;
+}
+
+STATIC int
+xfs_ioc_fsgetxattr(
+ xfs_inode_t *ip,
+ int attr,
+ void __user *arg)
+{
+ struct fsxattr fa;
+
+ memset(&fa, 0, sizeof(struct fsxattr));
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ fa.fsx_xflags = xfs_ip2xflags(ip);
+ fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+ fa.fsx_projid = xfs_get_projid(ip);
+
+ if (attr) {
+ if (ip->i_afp) {
+ if (ip->i_afp->if_flags & XFS_IFEXTENTS)
+ fa.fsx_nextents = ip->i_afp->if_bytes /
+ sizeof(xfs_bmbt_rec_t);
+ else
+ fa.fsx_nextents = ip->i_d.di_anextents;
+ } else
+ fa.fsx_nextents = 0;
+ } else {
+ if (ip->i_df.if_flags & XFS_IFEXTENTS)
+ fa.fsx_nextents = ip->i_df.if_bytes /
+ sizeof(xfs_bmbt_rec_t);
+ else
+ fa.fsx_nextents = ip->i_d.di_nextents;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (copy_to_user(arg, &fa, sizeof(fa)))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC void
+xfs_set_diflags(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ unsigned int di_flags;
+
+ /* can't set PREALLOC this way, just preserve it */
+ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+ if (xflags & XFS_XFLAG_IMMUTABLE)
+ di_flags |= XFS_DIFLAG_IMMUTABLE;
+ if (xflags & XFS_XFLAG_APPEND)
+ di_flags |= XFS_DIFLAG_APPEND;
+ if (xflags & XFS_XFLAG_SYNC)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if (xflags & XFS_XFLAG_NOATIME)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if (xflags & XFS_XFLAG_NODUMP)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if (xflags & XFS_XFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ if (xflags & XFS_XFLAG_NODEFRAG)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (xflags & XFS_XFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+ if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+ if (xflags & XFS_XFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (xflags & XFS_XFLAG_NOSYMLINKS)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if (xflags & XFS_XFLAG_EXTSZINHERIT)
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+ if (xflags & XFS_XFLAG_REALTIME)
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (xflags & XFS_XFLAG_EXTSIZE)
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ }
+
+ ip->i_d.di_flags = di_flags;
+}
+
+STATIC void
+xfs_diflags_to_linux(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+ unsigned int xflags = xfs_ip2xflags(ip);
+
+ if (xflags & XFS_XFLAG_IMMUTABLE)
+ inode->i_flags |= S_IMMUTABLE;
+ else
+ inode->i_flags &= ~S_IMMUTABLE;
+ if (xflags & XFS_XFLAG_APPEND)
+ inode->i_flags |= S_APPEND;
+ else
+ inode->i_flags &= ~S_APPEND;
+ if (xflags & XFS_XFLAG_SYNC)
+ inode->i_flags |= S_SYNC;
+ else
+ inode->i_flags &= ~S_SYNC;
+ if (xflags & XFS_XFLAG_NOATIME)
+ inode->i_flags |= S_NOATIME;
+ else
+ inode->i_flags &= ~S_NOATIME;
+}
+
+#define FSX_PROJID 1
+#define FSX_EXTSIZE 2
+#define FSX_XFLAGS 4
+#define FSX_NONBLOCK 8
+
+STATIC int
+xfs_ioctl_setattr(
+ xfs_inode_t *ip,
+ struct fsxattr *fa,
+ int mask)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int lock_flags = 0;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *olddquot = NULL;
+ int code;
+
+ trace_xfs_ioctl_setattr(ip);
+
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return XFS_ERROR(EROFS);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ /*
+ * Disallow 32bit project ids when projid32bit feature is not enabled.
+ */
+ if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
+ !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+ return XFS_ERROR(EINVAL);
+
+ /*
+ * If disk quotas is on, we make sure that the dquots do exist on disk,
+ * before we start any other transactions. Trying to do this later
+ * is messy. We don't care to take a readlock to look at the ids
+ * in inode here, because we can't hold it across the trans_reserve.
+ * If the IDs do change before we take the ilock, we're covered
+ * because the i_*dquot fields will get updated anyway.
+ */
+ if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
+ code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
+ ip->i_d.di_gid, fa->fsx_projid,
+ XFS_QMOPT_PQUOTA, &udqp, &gdqp);
+ if (code)
+ return code;
+ }
+
+ /*
+ * For the other attributes, we acquire the inode lock and
+ * first do an error checking pass.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ if (code)
+ goto error_return;
+
+ lock_flags = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_flags);
+
+ /*
+ * CAP_FOWNER overrides the following restrictions:
+ *
+ * The user ID of the calling process must be equal
+ * to the file owner ID, except in cases where the
+ * CAP_FSETID capability is applicable.
+ */
+ if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+ code = XFS_ERROR(EPERM);
+ goto error_return;
+ }
+
+ /*
+ * Do a quota reservation only if projid is actually going to change.
+ */
+ if (mask & FSX_PROJID) {
+ if (XFS_IS_QUOTA_RUNNING(mp) &&
+ XFS_IS_PQUOTA_ON(mp) &&
+ xfs_get_projid(ip) != fa->fsx_projid) {
+ ASSERT(tp);
+ code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
+ capable(CAP_FOWNER) ?
+ XFS_QMOPT_FORCE_RES : 0);
+ if (code) /* out of quota */
+ goto error_return;
+ }
+ }
+
+ if (mask & FSX_EXTSIZE) {
+ /*
+ * Can't change extent size if any extents are allocated.
+ */
+ if (ip->i_d.di_nextents &&
+ ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+ fa->fsx_extsize)) {
+ code = XFS_ERROR(EINVAL); /* EFBIG? */
+ goto error_return;
+ }
+
+ /*
+ * Extent size must be a multiple of the appropriate block
+ * size, if set at all. It must also be smaller than the
+ * maximum extent size supported by the filesystem.
+ *
+ * Also, for non-realtime files, limit the extent size hint to
+ * half the size of the AGs in the filesystem so alignment
+ * doesn't result in extents larger than an AG.
+ */
+ if (fa->fsx_extsize != 0) {
+ xfs_extlen_t size;
+ xfs_fsblock_t extsize_fsb;
+
+ extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+ if (extsize_fsb > MAXEXTLEN) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+
+ if (XFS_IS_REALTIME_INODE(ip) ||
+ ((mask & FSX_XFLAGS) &&
+ (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
+ size = mp->m_sb.sb_rextsize <<
+ mp->m_sb.sb_blocklog;
+ } else {
+ size = mp->m_sb.sb_blocksize;
+ if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+ }
+
+ if (fa->fsx_extsize % size) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+ }
+ }
+
+
+ if (mask & FSX_XFLAGS) {
+ /*
+ * Can't change realtime flag if any extents are allocated.
+ */
+ if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+ (XFS_IS_REALTIME_INODE(ip)) !=
+ (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+ code = XFS_ERROR(EINVAL); /* EFBIG? */
+ goto error_return;
+ }
+
+ /*
+ * If realtime flag is set then must have realtime data.
+ */
+ if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+ if ((mp->m_sb.sb_rblocks == 0) ||
+ (mp->m_sb.sb_rextsize == 0) ||
+ (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+ }
+
+ /*
+ * Can't modify an immutable/append-only file unless
+ * we have appropriate permission.
+ */
+ if ((ip->i_d.di_flags &
+ (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
+ (fa->fsx_xflags &
+ (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+ !capable(CAP_LINUX_IMMUTABLE)) {
+ code = XFS_ERROR(EPERM);
+ goto error_return;
+ }
+ }
+
+ xfs_trans_ijoin(tp, ip);
+
+ /*
+ * Change file ownership. Must be the owner or privileged.
+ */
+ if (mask & FSX_PROJID) {
+ /*
+ * CAP_FSETID overrides the following restrictions:
+ *
+ * The set-user-ID and set-group-ID bits of a file will be
+ * cleared upon successful return from chown()
+ */
+ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+ !capable(CAP_FSETID))
+ ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+ /*
+ * Change the ownerships and register quota modifications
+ * in the transaction.
+ */
+ if (xfs_get_projid(ip) != fa->fsx_projid) {
+ if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+ olddquot = xfs_qm_vop_chown(tp, ip,
+ &ip->i_gdquot, gdqp);
+ }
+ xfs_set_projid(ip, fa->fsx_projid);
+
+ /*
+ * We may have to rev the inode as well as
+ * the superblock version number since projids didn't
+ * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
+ */
+ if (ip->i_d.di_version == 1)
+ xfs_bump_ino_vers2(tp, ip);
+ }
+
+ }
+
+ if (mask & FSX_EXTSIZE)
+ ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+ if (mask & FSX_XFLAGS) {
+ xfs_set_diflags(ip, fa->fsx_xflags);
+ xfs_diflags_to_linux(ip);
+ }
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(xs_ig_attrchg);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ * This is slightly sub-optimal in that truncates require
+ * two sync transactions instead of one for wsync filesystems.
+ * One for the truncate and one for the timestamps since we
+ * don't want to change the timestamps unless we're sure the
+ * truncate worked. Truncates are less than 1% of the laddis
+ * mix so this probably isn't worth the trouble to optimize.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+ code = xfs_trans_commit(tp, 0);
+ xfs_iunlock(ip, lock_flags);
+
+ /*
+ * Release any dquot(s) the inode had kept before chown.
+ */
+ xfs_qm_dqrele(olddquot);
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+
+ return code;
+
+ error_return:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_trans_cancel(tp, 0);
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+ return code;
+}
+
+STATIC int
+xfs_ioc_fssetxattr(
+ xfs_inode_t *ip,
+ struct file *filp,
+ void __user *arg)
+{
+ struct fsxattr fa;
+ unsigned int mask;
+
+ if (copy_from_user(&fa, arg, sizeof(fa)))
+ return -EFAULT;
+
+ mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
+ if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+ mask |= FSX_NONBLOCK;
+
+ return -xfs_ioctl_setattr(ip, &fa, mask);
+}
+
+STATIC int
+xfs_ioc_getxflags(
+ xfs_inode_t *ip,
+ void __user *arg)
+{
+ unsigned int flags;
+
+ flags = xfs_di2lxflags(ip->i_d.di_flags);
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC int
+xfs_ioc_setxflags(
+ xfs_inode_t *ip,
+ struct file *filp,
+ void __user *arg)
+{
+ struct fsxattr fa;
+ unsigned int flags;
+ unsigned int mask;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+ FS_NOATIME_FL | FS_NODUMP_FL | \
+ FS_SYNC_FL))
+ return -EOPNOTSUPP;
+
+ mask = FSX_XFLAGS;
+ if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+ mask |= FSX_NONBLOCK;
+ fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
+
+ return -xfs_ioctl_setattr(ip, &fa, mask);
+}
+
+STATIC int
+xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
+{
+ struct getbmap __user *base = *ap;
+
+ /* copy only getbmap portion (not getbmapx) */
+ if (copy_to_user(base, bmv, sizeof(struct getbmap)))
+ return XFS_ERROR(EFAULT);
+
+ *ap += sizeof(struct getbmap);
+ return 0;
+}
+
+STATIC int
+xfs_ioc_getbmap(
+ struct xfs_inode *ip,
+ int ioflags,
+ unsigned int cmd,
+ void __user *arg)
+{
+ struct getbmapx bmx;
+ int error;
+
+ if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
+ return -XFS_ERROR(EFAULT);
+
+ if (bmx.bmv_count < 2)
+ return -XFS_ERROR(EINVAL);
+
+ bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
+ if (ioflags & IO_INVIS)
+ bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
+
+ error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+ (struct getbmap *)arg+1);
+ if (error)
+ return -error;
+
+ /* copy back header - only size of getbmap */
+ if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
+{
+ struct getbmapx __user *base = *ap;
+
+ if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
+ return XFS_ERROR(EFAULT);
+
+ *ap += sizeof(struct getbmapx);
+ return 0;
+}
+
+STATIC int
+xfs_ioc_getbmapx(
+ struct xfs_inode *ip,
+ void __user *arg)
+{
+ struct getbmapx bmx;
+ int error;
+
+ if (copy_from_user(&bmx, arg, sizeof(bmx)))
+ return -XFS_ERROR(EFAULT);
+
+ if (bmx.bmv_count < 2)
+ return -XFS_ERROR(EINVAL);
+
+ if (bmx.bmv_iflags & (~BMV_IF_VALID))
+ return -XFS_ERROR(EINVAL);
+
+ error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
+ (struct getbmapx *)arg+1);
+ if (error)
+ return -error;
+
+ /* copy back header */
+ if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
+ return -XFS_ERROR(EFAULT);
+
+ return 0;
+}
+
+/*
+ * Note: some of the ioctl's return positive numbers as a
+ * byte count indicating success, such as readlink_by_handle.
+ * So we don't "sign flip" like most other routines. This means
+ * true errors need to be returned as a negative value.
+ */
+long
+xfs_file_ioctl(
+ struct file *filp,
+ unsigned int cmd,
+ unsigned long p)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ void __user *arg = (void __user *)p;
+ int ioflags = 0;
+ int error;
+
+ if (filp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ trace_xfs_file_ioctl(ip);
+
+ switch (cmd) {
+ case FITRIM:
+ return xfs_ioc_trim(mp, arg);
+ case XFS_IOC_ALLOCSP:
+ case XFS_IOC_FREESP:
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_ALLOCSP64:
+ case XFS_IOC_FREESP64:
+ case XFS_IOC_RESVSP64:
+ case XFS_IOC_UNRESVSP64:
+ case XFS_IOC_ZERO_RANGE: {
+ xfs_flock64_t bf;
+
+ if (copy_from_user(&bf, arg, sizeof(bf)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ }
+ case XFS_IOC_DIOINFO: {
+ struct dioattr da;
+ xfs_buftarg_t *target =
+ XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp;
+
+ da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+ da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+
+ if (copy_to_user(arg, &da, sizeof(da)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+ }
+
+ case XFS_IOC_FSBULKSTAT_SINGLE:
+ case XFS_IOC_FSBULKSTAT:
+ case XFS_IOC_FSINUMBERS:
+ return xfs_ioc_bulkstat(mp, cmd, arg);
+
+ case XFS_IOC_FSGEOMETRY_V1:
+ return xfs_ioc_fsgeometry_v1(mp, arg);
+
+ case XFS_IOC_FSGEOMETRY:
+ return xfs_ioc_fsgeometry(mp, arg);
+
+ case XFS_IOC_GETVERSION:
+ return put_user(inode->i_generation, (int __user *)arg);
+
+ case XFS_IOC_FSGETXATTR:
+ return xfs_ioc_fsgetxattr(ip, 0, arg);
+ case XFS_IOC_FSGETXATTRA:
+ return xfs_ioc_fsgetxattr(ip, 1, arg);
+ case XFS_IOC_FSSETXATTR:
+ return xfs_ioc_fssetxattr(ip, filp, arg);
+ case XFS_IOC_GETXFLAGS:
+ return xfs_ioc_getxflags(ip, arg);
+ case XFS_IOC_SETXFLAGS:
+ return xfs_ioc_setxflags(ip, filp, arg);
+
+ case XFS_IOC_FSSETDM: {
+ struct fsdmidata dmi;
+
+ if (copy_from_user(&dmi, arg, sizeof(dmi)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
+ dmi.fsd_dmstate);
+ return -error;
+ }
+
+ case XFS_IOC_GETBMAP:
+ case XFS_IOC_GETBMAPA:
+ return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+
+ case XFS_IOC_GETBMAPX:
+ return xfs_ioc_getbmapx(ip, arg);
+
+ case XFS_IOC_FD_TO_HANDLE:
+ case XFS_IOC_PATH_TO_HANDLE:
+ case XFS_IOC_PATH_TO_FSHANDLE: {
+ xfs_fsop_handlereq_t hreq;
+
+ if (copy_from_user(&hreq, arg, sizeof(hreq)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_find_handle(cmd, &hreq);
+ }
+ case XFS_IOC_OPEN_BY_HANDLE: {
+ xfs_fsop_handlereq_t hreq;
+
+ if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_open_by_handle(filp, &hreq);
+ }
+ case XFS_IOC_FSSETDM_BY_HANDLE:
+ return xfs_fssetdm_by_handle(filp, arg);
+
+ case XFS_IOC_READLINK_BY_HANDLE: {
+ xfs_fsop_handlereq_t hreq;
+
+ if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_readlink_by_handle(filp, &hreq);
+ }
+ case XFS_IOC_ATTRLIST_BY_HANDLE:
+ return xfs_attrlist_by_handle(filp, arg);
+
+ case XFS_IOC_ATTRMULTI_BY_HANDLE:
+ return xfs_attrmulti_by_handle(filp, arg);
+
+ case XFS_IOC_SWAPEXT: {
+ struct xfs_swapext sxp;
+
+ if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_swapext(&sxp);
+ return -error;
+ }
+
+ case XFS_IOC_FSCOUNTS: {
+ xfs_fsop_counts_t out;
+
+ error = xfs_fs_counts(mp, &out);
+ if (error)
+ return -error;
+
+ if (copy_to_user(arg, &out, sizeof(out)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+ }
+
+ case XFS_IOC_SET_RESBLKS: {
+ xfs_fsop_resblks_t inout;
+ __uint64_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return -XFS_ERROR(EROFS);
+
+ if (copy_from_user(&inout, arg, sizeof(inout)))
+ return -XFS_ERROR(EFAULT);
+
+ /* input parameter is passed in resblks field of structure */
+ in = inout.resblks;
+ error = xfs_reserve_blocks(mp, &in, &inout);
+ if (error)
+ return -error;
+
+ if (copy_to_user(arg, &inout, sizeof(inout)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+ }
+
+ case XFS_IOC_GET_RESBLKS: {
+ xfs_fsop_resblks_t out;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = xfs_reserve_blocks(mp, NULL, &out);
+ if (error)
+ return -error;
+
+ if (copy_to_user(arg, &out, sizeof(out)))
+ return -XFS_ERROR(EFAULT);
+
+ return 0;
+ }
+
+ case XFS_IOC_FSGROWFSDATA: {
+ xfs_growfs_data_t in;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_growfs_data(mp, &in);
+ return -error;
+ }
+
+ case XFS_IOC_FSGROWFSLOG: {
+ xfs_growfs_log_t in;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_growfs_log(mp, &in);
+ return -error;
+ }
+
+ case XFS_IOC_FSGROWFSRT: {
+ xfs_growfs_rt_t in;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_growfs_rt(mp, &in);
+ return -error;
+ }
+
+ case XFS_IOC_GOINGDOWN: {
+ __uint32_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(in, (__uint32_t __user *)arg))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_fs_goingdown(mp, in);
+ return -error;
+ }
+
+ case XFS_IOC_ERROR_INJECTION: {
+ xfs_error_injection_t in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&in, arg, sizeof(in)))
+ return -XFS_ERROR(EFAULT);
+
+ error = xfs_errortag_add(in.errtag, mp);
+ return -error;
+ }
+
+ case XFS_IOC_ERROR_CLEARALL:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = xfs_errortag_clearall(mp, 1);
+ return -error;
+
+ default:
+ return -ENOTTY;
+ }
+}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
new file mode 100644
index 00000000..d56173b3
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_IOCTL_H__
+#define __XFS_IOCTL_H__
+
+extern int
+xfs_ioc_space(
+ struct xfs_inode *ip,
+ struct inode *inode,
+ struct file *filp,
+ int ioflags,
+ unsigned int cmd,
+ xfs_flock64_t *bf);
+
+extern int
+xfs_find_handle(
+ unsigned int cmd,
+ xfs_fsop_handlereq_t *hreq);
+
+extern int
+xfs_open_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq);
+
+extern int
+xfs_readlink_by_handle(
+ struct file *parfilp,
+ xfs_fsop_handlereq_t *hreq);
+
+extern int
+xfs_attrmulti_attr_get(
+ struct inode *inode,
+ unsigned char *name,
+ unsigned char __user *ubuf,
+ __uint32_t *len,
+ __uint32_t flags);
+
+extern int
+xfs_attrmulti_attr_set(
+ struct inode *inode,
+ unsigned char *name,
+ const unsigned char __user *ubuf,
+ __uint32_t len,
+ __uint32_t flags);
+
+extern int
+xfs_attrmulti_attr_remove(
+ struct inode *inode,
+ unsigned char *name,
+ __uint32_t flags);
+
+extern struct dentry *
+xfs_handle_to_dentry(
+ struct file *parfilp,
+ void __user *uhandle,
+ u32 hlen);
+
+extern long
+xfs_file_ioctl(
+ struct file *filp,
+ unsigned int cmd,
+ unsigned long p);
+
+extern long
+xfs_file_compat_ioctl(
+ struct file *file,
+ unsigned int cmd,
+ unsigned long arg);
+
+#endif
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
new file mode 100644
index 00000000..54e623bf
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_vnode.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_dfrag.h"
+#include "xfs_vnodeops.h"
+#include "xfs_fsops.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
+#include "xfs_ioctl32.h"
+#include "xfs_trace.h"
+
+#define _NATIVE_IOC(cmd, type) \
+ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
+
+#ifdef BROKEN_X86_ALIGNMENT
+STATIC int
+xfs_compat_flock64_copyin(
+ xfs_flock64_t *bf,
+ compat_xfs_flock64_t __user *arg32)
+{
+ if (get_user(bf->l_type, &arg32->l_type) ||
+ get_user(bf->l_whence, &arg32->l_whence) ||
+ get_user(bf->l_start, &arg32->l_start) ||
+ get_user(bf->l_len, &arg32->l_len) ||
+ get_user(bf->l_sysid, &arg32->l_sysid) ||
+ get_user(bf->l_pid, &arg32->l_pid) ||
+ copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_compat_ioc_fsgeometry_v1(
+ struct xfs_mount *mp,
+ compat_xfs_fsop_geom_v1_t __user *arg32)
+{
+ xfs_fsop_geom_t fsgeo;
+ int error;
+
+ error = xfs_fs_geometry(mp, &fsgeo, 3);
+ if (error)
+ return -error;
+ /* The 32-bit variant simply has some padding at the end */
+ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_compat_growfs_data_copyin(
+ struct xfs_growfs_data *in,
+ compat_xfs_growfs_data_t __user *arg32)
+{
+ if (get_user(in->newblocks, &arg32->newblocks) ||
+ get_user(in->imaxpct, &arg32->imaxpct))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_compat_growfs_rt_copyin(
+ struct xfs_growfs_rt *in,
+ compat_xfs_growfs_rt_t __user *arg32)
+{
+ if (get_user(in->newblocks, &arg32->newblocks) ||
+ get_user(in->extsize, &arg32->extsize))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_inumbers_fmt_compat(
+ void __user *ubuffer,
+ const xfs_inogrp_t *buffer,
+ long count,
+ long *written)
+{
+ compat_xfs_inogrp_t __user *p32 = ubuffer;
+ long i;
+
+ for (i = 0; i < count; i++) {
+ if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
+ put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
+ put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
+ return -XFS_ERROR(EFAULT);
+ }
+ *written = count * sizeof(*p32);
+ return 0;
+}
+
+#else
+#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
+#endif /* BROKEN_X86_ALIGNMENT */
+
+STATIC int
+xfs_ioctl32_bstime_copyin(
+ xfs_bstime_t *bstime,
+ compat_xfs_bstime_t __user *bstime32)
+{
+ compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
+
+ if (get_user(sec32, &bstime32->tv_sec) ||
+ get_user(bstime->tv_nsec, &bstime32->tv_nsec))
+ return -XFS_ERROR(EFAULT);
+ bstime->tv_sec = sec32;
+ return 0;
+}
+
+/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
+STATIC int
+xfs_ioctl32_bstat_copyin(
+ xfs_bstat_t *bstat,
+ compat_xfs_bstat_t __user *bstat32)
+{
+ if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
+ get_user(bstat->bs_mode, &bstat32->bs_mode) ||
+ get_user(bstat->bs_nlink, &bstat32->bs_nlink) ||
+ get_user(bstat->bs_uid, &bstat32->bs_uid) ||
+ get_user(bstat->bs_gid, &bstat32->bs_gid) ||
+ get_user(bstat->bs_rdev, &bstat32->bs_rdev) ||
+ get_user(bstat->bs_blksize, &bstat32->bs_blksize) ||
+ get_user(bstat->bs_size, &bstat32->bs_size) ||
+ xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
+ xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
+ xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
+ get_user(bstat->bs_blocks, &bstat32->bs_size) ||
+ get_user(bstat->bs_xflags, &bstat32->bs_size) ||
+ get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
+ get_user(bstat->bs_extents, &bstat32->bs_extents) ||
+ get_user(bstat->bs_gen, &bstat32->bs_gen) ||
+ get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+ get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
+ get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
+ get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
+ get_user(bstat->bs_aextents, &bstat32->bs_aextents))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+/* XFS_IOC_FSBULKSTAT and friends */
+
+STATIC int
+xfs_bstime_store_compat(
+ compat_xfs_bstime_t __user *p32,
+ const xfs_bstime_t *p)
+{
+ __s32 sec32;
+
+ sec32 = p->tv_sec;
+ if (put_user(sec32, &p32->tv_sec) ||
+ put_user(p->tv_nsec, &p32->tv_nsec))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+/* Return 0 on success or positive error (to xfs_bulkstat()) */
+STATIC int
+xfs_bulkstat_one_fmt_compat(
+ void __user *ubuffer,
+ int ubsize,
+ int *ubused,
+ const xfs_bstat_t *buffer)
+{
+ compat_xfs_bstat_t __user *p32 = ubuffer;
+
+ if (ubsize < sizeof(*p32))
+ return XFS_ERROR(ENOMEM);
+
+ if (put_user(buffer->bs_ino, &p32->bs_ino) ||
+ put_user(buffer->bs_mode, &p32->bs_mode) ||
+ put_user(buffer->bs_nlink, &p32->bs_nlink) ||
+ put_user(buffer->bs_uid, &p32->bs_uid) ||
+ put_user(buffer->bs_gid, &p32->bs_gid) ||
+ put_user(buffer->bs_rdev, &p32->bs_rdev) ||
+ put_user(buffer->bs_blksize, &p32->bs_blksize) ||
+ put_user(buffer->bs_size, &p32->bs_size) ||
+ xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
+ xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
+ xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
+ put_user(buffer->bs_blocks, &p32->bs_blocks) ||
+ put_user(buffer->bs_xflags, &p32->bs_xflags) ||
+ put_user(buffer->bs_extsize, &p32->bs_extsize) ||
+ put_user(buffer->bs_extents, &p32->bs_extents) ||
+ put_user(buffer->bs_gen, &p32->bs_gen) ||
+ put_user(buffer->bs_projid, &p32->bs_projid) ||
+ put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) ||
+ put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
+ put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
+ put_user(buffer->bs_aextents, &p32->bs_aextents))
+ return XFS_ERROR(EFAULT);
+ if (ubused)
+ *ubused = sizeof(*p32);
+ return 0;
+}
+
+STATIC int
+xfs_bulkstat_one_compat(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ int *ubused, /* bytes used by me */
+ int *stat) /* BULKSTAT_RV_... */
+{
+ return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+ xfs_bulkstat_one_fmt_compat,
+ ubused, stat);
+}
+
+/* copied from xfs_ioctl.c */
+STATIC int
+xfs_compat_ioc_bulkstat(
+ xfs_mount_t *mp,
+ unsigned int cmd,
+ compat_xfs_fsop_bulkreq_t __user *p32)
+{
+ u32 addr;
+ xfs_fsop_bulkreq_t bulkreq;
+ int count; /* # of records returned */
+ xfs_ino_t inlast; /* last inode number */
+ int done;
+ int error;
+
+ /* done = 1 if there are more stats to get and if bulkstat */
+ /* should be called again (unused here, but used in dmapi) */
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ if (get_user(addr, &p32->lastip))
+ return -XFS_ERROR(EFAULT);
+ bulkreq.lastip = compat_ptr(addr);
+ if (get_user(bulkreq.icount, &p32->icount) ||
+ get_user(addr, &p32->ubuffer))
+ return -XFS_ERROR(EFAULT);
+ bulkreq.ubuffer = compat_ptr(addr);
+ if (get_user(addr, &p32->ocount))
+ return -XFS_ERROR(EFAULT);
+ bulkreq.ocount = compat_ptr(addr);
+
+ if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+ return -XFS_ERROR(EFAULT);
+
+ if ((count = bulkreq.icount) <= 0)
+ return -XFS_ERROR(EINVAL);
+
+ if (bulkreq.ubuffer == NULL)
+ return -XFS_ERROR(EINVAL);
+
+ if (cmd == XFS_IOC_FSINUMBERS_32) {
+ error = xfs_inumbers(mp, &inlast, &count,
+ bulkreq.ubuffer, xfs_inumbers_fmt_compat);
+ } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+ int res;
+
+ error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
+ sizeof(compat_xfs_bstat_t), 0, &res);
+ } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
+ error = xfs_bulkstat(mp, &inlast, &count,
+ xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
+ bulkreq.ubuffer, &done);
+ } else
+ error = XFS_ERROR(EINVAL);
+ if (error)
+ return -error;
+
+ if (bulkreq.ocount != NULL) {
+ if (copy_to_user(bulkreq.lastip, &inlast,
+ sizeof(xfs_ino_t)))
+ return -XFS_ERROR(EFAULT);
+
+ if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+ return -XFS_ERROR(EFAULT);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_compat_handlereq_copyin(
+ xfs_fsop_handlereq_t *hreq,
+ compat_xfs_fsop_handlereq_t __user *arg32)
+{
+ compat_xfs_fsop_handlereq_t hreq32;
+
+ if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+
+ hreq->fd = hreq32.fd;
+ hreq->path = compat_ptr(hreq32.path);
+ hreq->oflags = hreq32.oflags;
+ hreq->ihandle = compat_ptr(hreq32.ihandle);
+ hreq->ihandlen = hreq32.ihandlen;
+ hreq->ohandle = compat_ptr(hreq32.ohandle);
+ hreq->ohandlen = compat_ptr(hreq32.ohandlen);
+
+ return 0;
+}
+
+STATIC struct dentry *
+xfs_compat_handlereq_to_dentry(
+ struct file *parfilp,
+ compat_xfs_fsop_handlereq_t *hreq)
+{
+ return xfs_handle_to_dentry(parfilp,
+ compat_ptr(hreq->ihandle), hreq->ihandlen);
+}
+
+STATIC int
+xfs_compat_attrlist_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ attrlist_cursor_kern_t *cursor;
+ compat_xfs_fsop_attrlist_handlereq_t al_hreq;
+ struct dentry *dentry;
+ char *kbuf;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&al_hreq, arg,
+ sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+ if (al_hreq.buflen > XATTR_LIST_MAX)
+ return -XFS_ERROR(EINVAL);
+
+ /*
+ * Reject flags, only allow namespaces.
+ */
+ if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+ return -XFS_ERROR(EINVAL);
+
+ dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = -ENOMEM;
+ kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+ if (!kbuf)
+ goto out_dput;
+
+ cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+ error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+ al_hreq.flags, cursor);
+ if (error)
+ goto out_kfree;
+
+ if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
+ error = -EFAULT;
+
+ out_kfree:
+ kfree(kbuf);
+ out_dput:
+ dput(dentry);
+ return error;
+}
+
+STATIC int
+xfs_compat_attrmulti_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ compat_xfs_attr_multiop_t *ops;
+ compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
+ struct dentry *dentry;
+ unsigned int i, size;
+ unsigned char *attr_name;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&am_hreq, arg,
+ sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+
+ /* overflow check */
+ if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
+ return -E2BIG;
+
+ dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ error = E2BIG;
+ size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
+ if (!size || size > 16 * PAGE_SIZE)
+ goto out_dput;
+
+ ops = memdup_user(compat_ptr(am_hreq.ops), size);
+ if (IS_ERR(ops)) {
+ error = PTR_ERR(ops);
+ goto out_dput;
+ }
+
+ attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+ if (!attr_name)
+ goto out_kfree_ops;
+
+ error = 0;
+ for (i = 0; i < am_hreq.opcount; i++) {
+ ops[i].am_error = strncpy_from_user((char *)attr_name,
+ compat_ptr(ops[i].am_attrname),
+ MAXNAMELEN);
+ if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+ error = -ERANGE;
+ if (ops[i].am_error < 0)
+ break;
+
+ switch (ops[i].am_opcode) {
+ case ATTR_OP_GET:
+ ops[i].am_error = xfs_attrmulti_attr_get(
+ dentry->d_inode, attr_name,
+ compat_ptr(ops[i].am_attrvalue),
+ &ops[i].am_length, ops[i].am_flags);
+ break;
+ case ATTR_OP_SET:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
+ ops[i].am_error = xfs_attrmulti_attr_set(
+ dentry->d_inode, attr_name,
+ compat_ptr(ops[i].am_attrvalue),
+ ops[i].am_length, ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
+ break;
+ case ATTR_OP_REMOVE:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
+ ops[i].am_error = xfs_attrmulti_attr_remove(
+ dentry->d_inode, attr_name,
+ ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
+ break;
+ default:
+ ops[i].am_error = EINVAL;
+ }
+ }
+
+ if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
+ error = XFS_ERROR(EFAULT);
+
+ kfree(attr_name);
+ out_kfree_ops:
+ kfree(ops);
+ out_dput:
+ dput(dentry);
+ return -error;
+}
+
+STATIC int
+xfs_compat_fssetdm_by_handle(
+ struct file *parfilp,
+ void __user *arg)
+{
+ int error;
+ struct fsdmidata fsd;
+ compat_xfs_fsop_setdm_handlereq_t dmhreq;
+ struct dentry *dentry;
+
+ if (!capable(CAP_MKNOD))
+ return -XFS_ERROR(EPERM);
+ if (copy_from_user(&dmhreq, arg,
+ sizeof(compat_xfs_fsop_setdm_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+
+ dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
+ error = -XFS_ERROR(EPERM);
+ goto out;
+ }
+
+ if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
+ error = -XFS_ERROR(EFAULT);
+ goto out;
+ }
+
+ error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+ fsd.fsd_dmstate);
+
+out:
+ dput(dentry);
+ return error;
+}
+
+long
+xfs_file_compat_ioctl(
+ struct file *filp,
+ unsigned cmd,
+ unsigned long p)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ void __user *arg = (void __user *)p;
+ int ioflags = 0;
+ int error;
+
+ if (filp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ trace_xfs_file_compat_ioctl(ip);
+
+ switch (cmd) {
+ /* No size or alignment issues on any arch */
+ case XFS_IOC_DIOINFO:
+ case XFS_IOC_FSGEOMETRY:
+ case XFS_IOC_FSGETXATTR:
+ case XFS_IOC_FSSETXATTR:
+ case XFS_IOC_FSGETXATTRA:
+ case XFS_IOC_FSSETDM:
+ case XFS_IOC_GETBMAP:
+ case XFS_IOC_GETBMAPA:
+ case XFS_IOC_GETBMAPX:
+ case XFS_IOC_FSCOUNTS:
+ case XFS_IOC_SET_RESBLKS:
+ case XFS_IOC_GET_RESBLKS:
+ case XFS_IOC_FSGROWFSLOG:
+ case XFS_IOC_GOINGDOWN:
+ case XFS_IOC_ERROR_INJECTION:
+ case XFS_IOC_ERROR_CLEARALL:
+ return xfs_file_ioctl(filp, cmd, p);
+#ifndef BROKEN_X86_ALIGNMENT
+ /* These are handled fine if no alignment issues */
+ case XFS_IOC_ALLOCSP:
+ case XFS_IOC_FREESP:
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_ALLOCSP64:
+ case XFS_IOC_FREESP64:
+ case XFS_IOC_RESVSP64:
+ case XFS_IOC_UNRESVSP64:
+ case XFS_IOC_FSGEOMETRY_V1:
+ case XFS_IOC_FSGROWFSDATA:
+ case XFS_IOC_FSGROWFSRT:
+ case XFS_IOC_ZERO_RANGE:
+ return xfs_file_ioctl(filp, cmd, p);
+#else
+ case XFS_IOC_ALLOCSP_32:
+ case XFS_IOC_FREESP_32:
+ case XFS_IOC_ALLOCSP64_32:
+ case XFS_IOC_FREESP64_32:
+ case XFS_IOC_RESVSP_32:
+ case XFS_IOC_UNRESVSP_32:
+ case XFS_IOC_RESVSP64_32:
+ case XFS_IOC_UNRESVSP64_32:
+ case XFS_IOC_ZERO_RANGE_32: {
+ struct xfs_flock64 bf;
+
+ if (xfs_compat_flock64_copyin(&bf, arg))
+ return -XFS_ERROR(EFAULT);
+ cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
+ return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ }
+ case XFS_IOC_FSGEOMETRY_V1_32:
+ return xfs_compat_ioc_fsgeometry_v1(mp, arg);
+ case XFS_IOC_FSGROWFSDATA_32: {
+ struct xfs_growfs_data in;
+
+ if (xfs_compat_growfs_data_copyin(&in, arg))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_growfs_data(mp, &in);
+ return -error;
+ }
+ case XFS_IOC_FSGROWFSRT_32: {
+ struct xfs_growfs_rt in;
+
+ if (xfs_compat_growfs_rt_copyin(&in, arg))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_growfs_rt(mp, &in);
+ return -error;
+ }
+#endif
+ /* long changes size, but xfs only copiese out 32 bits */
+ case XFS_IOC_GETXFLAGS_32:
+ case XFS_IOC_SETXFLAGS_32:
+ case XFS_IOC_GETVERSION_32:
+ cmd = _NATIVE_IOC(cmd, long);
+ return xfs_file_ioctl(filp, cmd, p);
+ case XFS_IOC_SWAPEXT_32: {
+ struct xfs_swapext sxp;
+ struct compat_xfs_swapext __user *sxu = arg;
+
+ /* Bulk copy in up to the sx_stat field, then copy bstat */
+ if (copy_from_user(&sxp, sxu,
+ offsetof(struct xfs_swapext, sx_stat)) ||
+ xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_swapext(&sxp);
+ return -error;
+ }
+ case XFS_IOC_FSBULKSTAT_32:
+ case XFS_IOC_FSBULKSTAT_SINGLE_32:
+ case XFS_IOC_FSINUMBERS_32:
+ return xfs_compat_ioc_bulkstat(mp, cmd, arg);
+ case XFS_IOC_FD_TO_HANDLE_32:
+ case XFS_IOC_PATH_TO_HANDLE_32:
+ case XFS_IOC_PATH_TO_FSHANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
+
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -XFS_ERROR(EFAULT);
+ cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
+ return xfs_find_handle(cmd, &hreq);
+ }
+ case XFS_IOC_OPEN_BY_HANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
+
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -XFS_ERROR(EFAULT);
+ return xfs_open_by_handle(filp, &hreq);
+ }
+ case XFS_IOC_READLINK_BY_HANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
+
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -XFS_ERROR(EFAULT);
+ return xfs_readlink_by_handle(filp, &hreq);
+ }
+ case XFS_IOC_ATTRLIST_BY_HANDLE_32:
+ return xfs_compat_attrlist_by_handle(filp, arg);
+ case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
+ return xfs_compat_attrmulti_by_handle(filp, arg);
+ case XFS_IOC_FSSETDM_BY_HANDLE_32:
+ return xfs_compat_fssetdm_by_handle(filp, arg);
+ default:
+ return -XFS_ERROR(ENOIOCTLCMD);
+ }
+}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
new file mode 100644
index 00000000..80f4060e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_IOCTL32_H__
+#define __XFS_IOCTL32_H__
+
+#include <linux/compat.h>
+
+/*
+ * on 32-bit arches, ioctl argument structures may have different sizes
+ * and/or alignment. We define compat structures which match the
+ * 32-bit sizes/alignments here, and their associated ioctl numbers.
+ *
+ * xfs_ioctl32.c contains routines to copy these structures in and out.
+ */
+
+/* stock kernel-level ioctls we support */
+#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
+#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
+#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
+
+/*
+ * On intel, even if sizes match, alignment and/or padding may differ.
+ */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#define BROKEN_X86_ALIGNMENT
+#define __compat_packed __attribute__((packed))
+#else
+#define __compat_packed
+#endif
+
+typedef struct compat_xfs_bstime {
+ compat_time_t tv_sec; /* seconds */
+ __s32 tv_nsec; /* and nanoseconds */
+} compat_xfs_bstime_t;
+
+typedef struct compat_xfs_bstat {
+ __u64 bs_ino; /* inode number */
+ __u16 bs_mode; /* type and mode */
+ __u16 bs_nlink; /* number of links */
+ __u32 bs_uid; /* user id */
+ __u32 bs_gid; /* group id */
+ __u32 bs_rdev; /* device value */
+ __s32 bs_blksize; /* block size */
+ __s64 bs_size; /* file size */
+ compat_xfs_bstime_t bs_atime; /* access time */
+ compat_xfs_bstime_t bs_mtime; /* modify time */
+ compat_xfs_bstime_t bs_ctime; /* inode change time */
+ int64_t bs_blocks; /* number of blocks */
+ __u32 bs_xflags; /* extended flags */
+ __s32 bs_extsize; /* extent size */
+ __s32 bs_extents; /* number of extents */
+ __u32 bs_gen; /* generation count */
+ __u16 bs_projid_lo; /* lower part of project id */
+#define bs_projid bs_projid_lo /* (previously just bs_projid) */
+ __u16 bs_projid_hi; /* high part of project id */
+ unsigned char bs_pad[12]; /* pad space, unused */
+ __u32 bs_dmevmask; /* DMIG event mask */
+ __u16 bs_dmstate; /* DMIG state info */
+ __u16 bs_aextents; /* attribute number of extents */
+} __compat_packed compat_xfs_bstat_t;
+
+typedef struct compat_xfs_fsop_bulkreq {
+ compat_uptr_t lastip; /* last inode # pointer */
+ __s32 icount; /* count of entries in buffer */
+ compat_uptr_t ubuffer; /* user buffer for inode desc. */
+ compat_uptr_t ocount; /* output count pointer */
+} compat_xfs_fsop_bulkreq_t;
+
+#define XFS_IOC_FSBULKSTAT_32 \
+ _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
+ _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS_32 \
+ _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+
+typedef struct compat_xfs_fsop_handlereq {
+ __u32 fd; /* fd for FD_TO_HANDLE */
+ compat_uptr_t path; /* user pathname */
+ __u32 oflags; /* open flags */
+ compat_uptr_t ihandle; /* user supplied handle */
+ __u32 ihandlen; /* user supplied length */
+ compat_uptr_t ohandle; /* user buffer for handle */
+ compat_uptr_t ohandlen; /* user buffer length */
+} compat_xfs_fsop_handlereq_t;
+
+#define XFS_IOC_PATH_TO_FSHANDLE_32 \
+ _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE_32 \
+ _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE_32 \
+ _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE_32 \
+ _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE_32 \
+ _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
+
+/* The bstat field in the swapext struct needs translation */
+typedef struct compat_xfs_swapext {
+ __int64_t sx_version; /* version */
+ __int64_t sx_fdtarget; /* fd of target file */
+ __int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* leng from offset */
+ char sx_pad[16]; /* pad space, unused */
+ compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */
+} __compat_packed compat_xfs_swapext_t;
+
+#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
+
+typedef struct compat_xfs_fsop_attrlist_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+ __u32 flags; /* which namespace to use */
+ __u32 buflen; /* length of buffer supplied */
+ compat_uptr_t buffer; /* returned names */
+} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
+
+/* Note: actually this is read/write */
+#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
+ _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
+
+/* am_opcodes defined in xfs_fs.h */
+typedef struct compat_xfs_attr_multiop {
+ __u32 am_opcode;
+ __s32 am_error;
+ compat_uptr_t am_attrname;
+ compat_uptr_t am_attrvalue;
+ __u32 am_length;
+ __u32 am_flags;
+} compat_xfs_attr_multiop_t;
+
+typedef struct compat_xfs_fsop_attrmulti_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+ __u32 opcount;/* count of following multiop */
+ /* ptr to compat_xfs_attr_multiop */
+ compat_uptr_t ops; /* attr_multi data */
+} compat_xfs_fsop_attrmulti_handlereq_t;
+
+#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
+ _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
+
+typedef struct compat_xfs_fsop_setdm_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle information */
+ /* ptr to struct fsdmidata */
+ compat_uptr_t data; /* DMAPI data */
+} compat_xfs_fsop_setdm_handlereq_t;
+
+#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
+ _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
+
+#ifdef BROKEN_X86_ALIGNMENT
+/* on ia32 l_start is on a 32-bit boundary */
+typedef struct compat_xfs_flock64 {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start __attribute__((packed));
+ /* len == 0 means until end of file */
+ __s64 l_len __attribute__((packed));
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserve area */
+} compat_xfs_flock64_t;
+
+#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
+#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
+#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
+
+typedef struct compat_xfs_fsop_geom_v1 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
+
+#define XFS_IOC_FSGEOMETRY_V1_32 \
+ _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
+
+typedef struct compat_xfs_inogrp {
+ __u64 xi_startino; /* starting inode number */
+ __s32 xi_alloccount; /* # bits set in allocmask */
+ __u64 xi_allocmask; /* mask of allocated inodes */
+} __attribute__((packed)) compat_xfs_inogrp_t;
+
+/* These growfs input structures have padding on the end, so must translate */
+typedef struct compat_xfs_growfs_data {
+ __u64 newblocks; /* new data subvol size, fsblocks */
+ __u32 imaxpct; /* new inode space percentage limit */
+} __attribute__((packed)) compat_xfs_growfs_data_t;
+
+typedef struct compat_xfs_growfs_rt {
+ __u64 newblocks; /* new realtime size, fsblocks */
+ __u32 extsize; /* new realtime extent size, fsblocks */
+} __attribute__((packed)) compat_xfs_growfs_rt_t;
+
+#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
+#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt)
+
+#endif /* BROKEN_X86_ALIGNMENT */
+
+#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
new file mode 100644
index 00000000..f5b697bf
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_acl.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_rw.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+#include <linux/capability.h>
+#include <linux/xattr.h>
+#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/security.h>
+#include <linux/fiemap.h>
+#include <linux/slab.h>
+
+/*
+ * Bring the timestamps in the XFS inode uptodate.
+ *
+ * Used before writing the inode to disk.
+ */
+void
+xfs_synchronize_times(
+ xfs_inode_t *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+ ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+ ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
+}
+
+/*
+ * If the linux inode is valid, mark it dirty, else mark the dirty state
+ * in the XFS inode to make sure we pick it up when reclaiming the inode.
+ */
+void
+xfs_mark_inode_dirty_sync(
+ xfs_inode_t *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
+ mark_inode_dirty_sync(inode);
+ else {
+ barrier();
+ ip->i_update_core = 1;
+ }
+}
+
+void
+xfs_mark_inode_dirty(
+ xfs_inode_t *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
+ mark_inode_dirty(inode);
+ else {
+ barrier();
+ ip->i_update_core = 1;
+ }
+
+}
+
+/*
+ * Hook in SELinux. This is not quite correct yet, what we really need
+ * here (as we do for default ACLs) is a mechanism by which creation of
+ * these attrs can be journalled at inode creation time (along with the
+ * inode, of course, such that log replay can't cause these to be lost).
+ */
+STATIC int
+xfs_init_security(
+ struct inode *inode,
+ struct inode *dir,
+ const struct qstr *qstr)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ size_t length;
+ void *value;
+ unsigned char *name;
+ int error;
+
+ error = security_inode_init_security(inode, dir, qstr, (char **)&name,
+ &value, &length);
+ if (error) {
+ if (error == -EOPNOTSUPP)
+ return 0;
+ return -error;
+ }
+
+ error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
+
+ kfree(name);
+ kfree(value);
+ return error;
+}
+
+static void
+xfs_dentry_to_name(
+ struct xfs_name *namep,
+ struct dentry *dentry)
+{
+ namep->name = dentry->d_name.name;
+ namep->len = dentry->d_name.len;
+}
+
+STATIC void
+xfs_cleanup_inode(
+ struct inode *dir,
+ struct inode *inode,
+ struct dentry *dentry)
+{
+ struct xfs_name teardown;
+
+ /* Oh, the horror.
+ * If we can't add the ACL or we fail in
+ * xfs_init_security we must back out.
+ * ENOSPC can hit here, among other things.
+ */
+ xfs_dentry_to_name(&teardown, dentry);
+
+ xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
+ iput(inode);
+}
+
+STATIC int
+xfs_vn_mknod(
+ struct inode *dir,
+ struct dentry *dentry,
+ int mode,
+ dev_t rdev)
+{
+ struct inode *inode;
+ struct xfs_inode *ip = NULL;
+ struct posix_acl *default_acl = NULL;
+ struct xfs_name name;
+ int error;
+
+ /*
+ * Irix uses Missed'em'V split, but doesn't want to see
+ * the upper 5 bits of (14bit) major.
+ */
+ if (S_ISCHR(mode) || S_ISBLK(mode)) {
+ if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+ return -EINVAL;
+ rdev = sysv_encode_dev(rdev);
+ } else {
+ rdev = 0;
+ }
+
+ if (IS_POSIXACL(dir)) {
+ default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(default_acl))
+ return PTR_ERR(default_acl);
+
+ if (!default_acl)
+ mode &= ~current_umask();
+ }
+
+ xfs_dentry_to_name(&name, dentry);
+ error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ if (unlikely(error))
+ goto out_free_acl;
+
+ inode = VFS_I(ip);
+
+ error = xfs_init_security(inode, dir, &dentry->d_name);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+
+ if (default_acl) {
+ error = -xfs_inherit_acl(inode, default_acl);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+ posix_acl_release(default_acl);
+ }
+
+
+ d_instantiate(dentry, inode);
+ return -error;
+
+ out_cleanup_inode:
+ xfs_cleanup_inode(dir, inode, dentry);
+ out_free_acl:
+ posix_acl_release(default_acl);
+ return -error;
+}
+
+STATIC int
+xfs_vn_create(
+ struct inode *dir,
+ struct dentry *dentry,
+ int mode,
+ struct nameidata *nd)
+{
+ return xfs_vn_mknod(dir, dentry, mode, 0);
+}
+
+STATIC int
+xfs_vn_mkdir(
+ struct inode *dir,
+ struct dentry *dentry,
+ int mode)
+{
+ return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
+}
+
+STATIC struct dentry *
+xfs_vn_lookup(
+ struct inode *dir,
+ struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct xfs_inode *cip;
+ struct xfs_name name;
+ int error;
+
+ if (dentry->d_name.len >= MAXNAMELEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ xfs_dentry_to_name(&name, dentry);
+ error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
+ if (unlikely(error)) {
+ if (unlikely(error != ENOENT))
+ return ERR_PTR(-error);
+ d_add(dentry, NULL);
+ return NULL;
+ }
+
+ return d_splice_alias(VFS_I(cip), dentry);
+}
+
+STATIC struct dentry *
+xfs_vn_ci_lookup(
+ struct inode *dir,
+ struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct xfs_inode *ip;
+ struct xfs_name xname;
+ struct xfs_name ci_name;
+ struct qstr dname;
+ int error;
+
+ if (dentry->d_name.len >= MAXNAMELEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ xfs_dentry_to_name(&xname, dentry);
+ error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
+ if (unlikely(error)) {
+ if (unlikely(error != ENOENT))
+ return ERR_PTR(-error);
+ /*
+ * call d_add(dentry, NULL) here when d_drop_negative_children
+ * is called in xfs_vn_mknod (ie. allow negative dentries
+ * with CI filesystems).
+ */
+ return NULL;
+ }
+
+ /* if exact match, just splice and exit */
+ if (!ci_name.name)
+ return d_splice_alias(VFS_I(ip), dentry);
+
+ /* else case-insensitive match... */
+ dname.name = ci_name.name;
+ dname.len = ci_name.len;
+ dentry = d_add_ci(dentry, VFS_I(ip), &dname);
+ kmem_free(ci_name.name);
+ return dentry;
+}
+
+STATIC int
+xfs_vn_link(
+ struct dentry *old_dentry,
+ struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+ struct xfs_name name;
+ int error;
+
+ xfs_dentry_to_name(&name, dentry);
+
+ error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
+ if (unlikely(error))
+ return -error;
+
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+STATIC int
+xfs_vn_unlink(
+ struct inode *dir,
+ struct dentry *dentry)
+{
+ struct xfs_name name;
+ int error;
+
+ xfs_dentry_to_name(&name, dentry);
+
+ error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+ if (error)
+ return error;
+
+ /*
+ * With unlink, the VFS makes the dentry "negative": no inode,
+ * but still hashed. This is incompatible with case-insensitive
+ * mode, so invalidate (unhash) the dentry in CI-mode.
+ */
+ if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
+ d_invalidate(dentry);
+ return 0;
+}
+
+STATIC int
+xfs_vn_symlink(
+ struct inode *dir,
+ struct dentry *dentry,
+ const char *symname)
+{
+ struct inode *inode;
+ struct xfs_inode *cip = NULL;
+ struct xfs_name name;
+ int error;
+ mode_t mode;
+
+ mode = S_IFLNK |
+ (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
+ xfs_dentry_to_name(&name, dentry);
+
+ error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
+ if (unlikely(error))
+ goto out;
+
+ inode = VFS_I(cip);
+
+ error = xfs_init_security(inode, dir, &dentry->d_name);
+ if (unlikely(error))
+ goto out_cleanup_inode;
+
+ d_instantiate(dentry, inode);
+ return 0;
+
+ out_cleanup_inode:
+ xfs_cleanup_inode(dir, inode, dentry);
+ out:
+ return -error;
+}
+
+STATIC int
+xfs_vn_rename(
+ struct inode *odir,
+ struct dentry *odentry,
+ struct inode *ndir,
+ struct dentry *ndentry)
+{
+ struct inode *new_inode = ndentry->d_inode;
+ struct xfs_name oname;
+ struct xfs_name nname;
+
+ xfs_dentry_to_name(&oname, odentry);
+ xfs_dentry_to_name(&nname, ndentry);
+
+ return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+ XFS_I(ndir), &nname, new_inode ?
+ XFS_I(new_inode) : NULL);
+}
+
+/*
+ * careful here - this function can get called recursively, so
+ * we need to be very careful about how much stack we use.
+ * uio is kmalloced for this reason...
+ */
+STATIC void *
+xfs_vn_follow_link(
+ struct dentry *dentry,
+ struct nameidata *nd)
+{
+ char *link;
+ int error = -ENOMEM;
+
+ link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+ if (!link)
+ goto out_err;
+
+ error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+ if (unlikely(error))
+ goto out_kfree;
+
+ nd_set_link(nd, link);
+ return NULL;
+
+ out_kfree:
+ kfree(link);
+ out_err:
+ nd_set_link(nd, ERR_PTR(error));
+ return NULL;
+}
+
+STATIC void
+xfs_vn_put_link(
+ struct dentry *dentry,
+ struct nameidata *nd,
+ void *p)
+{
+ char *s = nd_get_link(nd);
+
+ if (!IS_ERR(s))
+ kfree(s);
+}
+
+STATIC int
+xfs_vn_getattr(
+ struct vfsmount *mnt,
+ struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ trace_xfs_getattr(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ stat->size = XFS_ISIZE(ip);
+ stat->dev = inode->i_sb->s_dev;
+ stat->mode = ip->i_d.di_mode;
+ stat->nlink = ip->i_d.di_nlink;
+ stat->uid = ip->i_d.di_uid;
+ stat->gid = ip->i_d.di_gid;
+ stat->ino = ip->i_ino;
+ stat->atime = inode->i_atime;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
+ stat->blocks =
+ XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ stat->blksize = BLKDEV_IOSIZE;
+ stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+ sysv_minor(ip->i_df.if_u2.if_rdev));
+ break;
+ default:
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * If the file blocks are being allocated from a
+ * realtime volume, then return the inode's realtime
+ * extent size or the realtime volume's extent size.
+ */
+ stat->blksize =
+ xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
+ } else
+ stat->blksize = xfs_preferred_iosize(mp);
+ stat->rdev = 0;
+ break;
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_vn_setattr(
+ struct dentry *dentry,
+ struct iattr *iattr)
+{
+ return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
+}
+
+#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+/*
+ * Call fiemap helper to fill in user data.
+ * Returns positive errors to xfs_getbmap.
+ */
+STATIC int
+xfs_fiemap_format(
+ void **arg,
+ struct getbmapx *bmv,
+ int *full)
+{
+ int error;
+ struct fiemap_extent_info *fieinfo = *arg;
+ u32 fiemap_flags = 0;
+ u64 logical, physical, length;
+
+ /* Do nothing for a hole */
+ if (bmv->bmv_block == -1LL)
+ return 0;
+
+ logical = BBTOB(bmv->bmv_offset);
+ physical = BBTOB(bmv->bmv_block);
+ length = BBTOB(bmv->bmv_length);
+
+ if (bmv->bmv_oflags & BMV_OF_PREALLOC)
+ fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
+ else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
+ fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+ physical = 0; /* no block yet */
+ }
+ if (bmv->bmv_oflags & BMV_OF_LAST)
+ fiemap_flags |= FIEMAP_EXTENT_LAST;
+
+ error = fiemap_fill_next_extent(fieinfo, logical, physical,
+ length, fiemap_flags);
+ if (error > 0) {
+ error = 0;
+ *full = 1; /* user array now full */
+ }
+
+ return -error;
+}
+
+STATIC int
+xfs_vn_fiemap(
+ struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ u64 start,
+ u64 length)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+ struct getbmapx bm;
+ int error;
+
+ error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
+ if (error)
+ return error;
+
+ /* Set up bmap header for xfs internal routine */
+ bm.bmv_offset = BTOBB(start);
+ /* Special case for whole file */
+ if (length == FIEMAP_MAX_OFFSET)
+ bm.bmv_length = -1LL;
+ else
+ bm.bmv_length = BTOBB(length);
+
+ /* We add one because in getbmap world count includes the header */
+ bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
+ fieinfo->fi_extents_max + 1;
+ bm.bmv_count = min_t(__s32, bm.bmv_count,
+ (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
+ bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
+ bm.bmv_iflags |= BMV_IF_ATTRFORK;
+ if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
+ bm.bmv_iflags |= BMV_IF_DELALLOC;
+
+ error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
+ if (error)
+ return -error;
+
+ return 0;
+}
+
+static const struct inode_operations xfs_inode_operations = {
+ .check_acl = xfs_check_acl,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = xfs_vn_listxattr,
+ .fiemap = xfs_vn_fiemap,
+};
+
+static const struct inode_operations xfs_dir_inode_operations = {
+ .create = xfs_vn_create,
+ .lookup = xfs_vn_lookup,
+ .link = xfs_vn_link,
+ .unlink = xfs_vn_unlink,
+ .symlink = xfs_vn_symlink,
+ .mkdir = xfs_vn_mkdir,
+ /*
+ * Yes, XFS uses the same method for rmdir and unlink.
+ *
+ * There are some subtile differences deeper in the code,
+ * but we use S_ISDIR to check for those.
+ */
+ .rmdir = xfs_vn_unlink,
+ .mknod = xfs_vn_mknod,
+ .rename = xfs_vn_rename,
+ .check_acl = xfs_check_acl,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = xfs_vn_listxattr,
+};
+
+static const struct inode_operations xfs_dir_ci_inode_operations = {
+ .create = xfs_vn_create,
+ .lookup = xfs_vn_ci_lookup,
+ .link = xfs_vn_link,
+ .unlink = xfs_vn_unlink,
+ .symlink = xfs_vn_symlink,
+ .mkdir = xfs_vn_mkdir,
+ /*
+ * Yes, XFS uses the same method for rmdir and unlink.
+ *
+ * There are some subtile differences deeper in the code,
+ * but we use S_ISDIR to check for those.
+ */
+ .rmdir = xfs_vn_unlink,
+ .mknod = xfs_vn_mknod,
+ .rename = xfs_vn_rename,
+ .check_acl = xfs_check_acl,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = xfs_vn_listxattr,
+};
+
+static const struct inode_operations xfs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = xfs_vn_follow_link,
+ .put_link = xfs_vn_put_link,
+ .check_acl = xfs_check_acl,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = xfs_vn_listxattr,
+};
+
+STATIC void
+xfs_diflags_to_iflags(
+ struct inode *inode,
+ struct xfs_inode *ip)
+{
+ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+ inode->i_flags |= S_IMMUTABLE;
+ else
+ inode->i_flags &= ~S_IMMUTABLE;
+ if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+ inode->i_flags |= S_APPEND;
+ else
+ inode->i_flags &= ~S_APPEND;
+ if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+ inode->i_flags |= S_SYNC;
+ else
+ inode->i_flags &= ~S_SYNC;
+ if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+ inode->i_flags |= S_NOATIME;
+ else
+ inode->i_flags &= ~S_NOATIME;
+}
+
+/*
+ * Initialize the Linux inode, set up the operation vectors and
+ * unlock the inode.
+ *
+ * When reading existing inodes from disk this is called directly
+ * from xfs_iget, when creating a new inode it is called from
+ * xfs_ialloc after setting up the inode.
+ *
+ * We are always called with an uninitialised linux inode here.
+ * We need to initialise the necessary fields and take a reference
+ * on it.
+ */
+void
+xfs_setup_inode(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = &ip->i_vnode;
+
+ inode->i_ino = ip->i_ino;
+ inode->i_state = I_NEW;
+
+ inode_sb_list_add(inode);
+ /* make the inode look hashed for the writeback code */
+ hlist_add_fake(&inode->i_hash);
+
+ inode->i_mode = ip->i_d.di_mode;
+ inode->i_nlink = ip->i_d.di_nlink;
+ inode->i_uid = ip->i_d.di_uid;
+ inode->i_gid = ip->i_d.di_gid;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ inode->i_rdev =
+ MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+ sysv_minor(ip->i_df.if_u2.if_rdev));
+ break;
+ default:
+ inode->i_rdev = 0;
+ break;
+ }
+
+ inode->i_generation = ip->i_d.di_gen;
+ i_size_write(inode, ip->i_d.di_size);
+ inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
+ inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
+ inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
+ inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
+ inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
+ inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+ xfs_diflags_to_iflags(inode, ip);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &xfs_inode_operations;
+ inode->i_fop = &xfs_file_operations;
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
+ break;
+ case S_IFDIR:
+ if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+ inode->i_op = &xfs_dir_ci_inode_operations;
+ else
+ inode->i_op = &xfs_dir_inode_operations;
+ inode->i_fop = &xfs_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &xfs_symlink_inode_operations;
+ if (!(ip->i_df.if_flags & XFS_IFINLINE))
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
+ break;
+ default:
+ inode->i_op = &xfs_inode_operations;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ }
+
+ xfs_iflags_clear(ip, XFS_INEW);
+ barrier();
+
+ unlock_new_inode(inode);
+}
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
new file mode 100644
index 00000000..ef41c92c
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_IOPS_H__
+#define __XFS_IOPS_H__
+
+struct xfs_inode;
+
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+
+extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
+
+extern void xfs_setup_inode(struct xfs_inode *);
+
+#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
new file mode 100644
index 00000000..87315168
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_LINUX__
+#define __XFS_LINUX__
+
+#include <linux/types.h>
+
+/*
+ * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
+ * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
+ */
+#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
+# define XFS_BIG_BLKNOS 1
+# define XFS_BIG_INUMS 1
+#else
+# define XFS_BIG_BLKNOS 0
+# define XFS_BIG_INUMS 0
+#endif
+
+#include <xfs_types.h>
+#include <xfs_arch.h>
+
+#include <kmem.h>
+#include <mrlock.h>
+#include <time.h>
+
+#include <support/uuid.h>
+
+#include <linux/semaphore.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/major.h>
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/sort.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/delay.h>
+#include <linux/log2.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/ctype.h>
+#include <linux/writeback.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/list_sort.h>
+
+#include <asm/page.h>
+#include <asm/div64.h>
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include <xfs_vnode.h>
+#include <xfs_stats.h>
+#include <xfs_sysctl.h>
+#include <xfs_iops.h>
+#include <xfs_aops.h>
+#include <xfs_super.h>
+#include <xfs_buf.h>
+#include <xfs_message.h>
+
+/*
+ * Feature macros (disable/enable)
+ */
+#ifdef CONFIG_SMP
+#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
+#else
+#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
+#endif
+
+#define irix_sgid_inherit xfs_params.sgid_inherit.val
+#define irix_symlink_mode xfs_params.symlink_mode.val
+#define xfs_panic_mask xfs_params.panic_mask.val
+#define xfs_error_level xfs_params.error_level.val
+#define xfs_syncd_centisecs xfs_params.syncd_timer.val
+#define xfs_stats_clear xfs_params.stats_clear.val
+#define xfs_inherit_sync xfs_params.inherit_sync.val
+#define xfs_inherit_nodump xfs_params.inherit_nodump.val
+#define xfs_inherit_noatime xfs_params.inherit_noatim.val
+#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
+#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
+#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
+#define xfs_rotorstep xfs_params.rotorstep.val
+#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
+#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
+
+#define current_cpu() (raw_smp_processor_id())
+#define current_pid() (current->pid)
+#define current_test_flags(f) (current->flags & (f))
+#define current_set_flags_nested(sp, f) \
+ (*(sp) = current->flags, current->flags |= (f))
+#define current_clear_flags_nested(sp, f) \
+ (*(sp) = current->flags, current->flags &= ~(f))
+#define current_restore_flags_nested(sp, f) \
+ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
+
+#define spinlock_destroy(lock)
+
+#define NBBY 8 /* number of bits per byte */
+
+/*
+ * Size of block device i/o is parameterized here.
+ * Currently the system supports page-sized i/o.
+ */
+#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT
+#define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
+/* number of BB's per block device block */
+#define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
+
+#define ENOATTR ENODATA /* Attribute not found */
+#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
+#define SYNCHRONIZE() barrier()
+#define __return_address __builtin_return_address(0)
+
+#define XFS_PROJID_DEFAULT 0
+#define MAXPATHLEN 1024
+
+#define MIN(a,b) (min(a,b))
+#define MAX(a,b) (max(a,b))
+#define howmany(x, y) (((x)+((y)-1))/(y))
+
+/*
+ * Various platform dependent calls that don't fit anywhere else
+ */
+#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
+#define xfs_stack_trace() dump_stack()
+
+
+/* Move the kernel do_div definition off to one side */
+
+#if defined __i386__
+/* For ia32 we need to pull some tricks to get past various versions
+ * of the compiler which do not like us using do_div in the middle
+ * of large functions.
+ */
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+ __u32 mod;
+
+ switch (n) {
+ case 4:
+ mod = *(__u32 *)a % b;
+ *(__u32 *)a = *(__u32 *)a / b;
+ return mod;
+ case 8:
+ {
+ unsigned long __upper, __low, __high, __mod;
+ __u64 c = *(__u64 *)a;
+ __upper = __high = c >> 32;
+ __low = c;
+ if (__high) {
+ __upper = __high % (b);
+ __high = __high / (b);
+ }
+ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+ asm("":"=A" (c):"a" (__low),"d" (__high));
+ *(__u64 *)a = c;
+ return __mod;
+ }
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+ switch (n) {
+ case 4:
+ return *(__u32 *)a % b;
+ case 8:
+ {
+ unsigned long __upper, __low, __high, __mod;
+ __u64 c = *(__u64 *)a;
+ __upper = __high = c >> 32;
+ __low = c;
+ if (__high) {
+ __upper = __high % (b);
+ __high = __high / (b);
+ }
+ asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+ asm("":"=A" (c):"a" (__low),"d" (__high));
+ return __mod;
+ }
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
+#else
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+ __u32 mod;
+
+ switch (n) {
+ case 4:
+ mod = *(__u32 *)a % b;
+ *(__u32 *)a = *(__u32 *)a / b;
+ return mod;
+ case 8:
+ mod = do_div(*(__u64 *)a, b);
+ return mod;
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+ switch (n) {
+ case 4:
+ return *(__u32 *)a % b;
+ case 8:
+ {
+ __u64 c = *(__u64 *)a;
+ return do_div(c, b);
+ }
+ }
+
+ /* NOTREACHED */
+ return 0;
+}
+#endif
+
+#undef do_div
+#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
+#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
+
+static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
+{
+ x += y - 1;
+ do_div(x, y);
+ return(x * y);
+}
+
+static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
+{
+ x += y - 1;
+ do_div(x, y);
+ return x;
+}
+
+/* ARM old ABI has some weird alignment/padding */
+#if defined(__arm__) && !defined(__ARM_EABI__)
+#define __arch_pack __attribute__((packed))
+#else
+#define __arch_pack
+#endif
+
+#define ASSERT_ALWAYS(expr) \
+ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifndef DEBUG
+#define ASSERT(expr) ((void)0)
+
+#ifndef STATIC
+# define STATIC static noinline
+#endif
+
+#else /* DEBUG */
+
+#define ASSERT(expr) \
+ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifndef STATIC
+# define STATIC noinline
+#endif
+
+#endif /* DEBUG */
+
+#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 00000000..bd672def
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+
+/*
+ * XFS logging functions
+ */
+static void
+__xfs_printk(
+ const char *level,
+ const struct xfs_mount *mp,
+ struct va_format *vaf)
+{
+ if (mp && mp->m_fsname) {
+ printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+ return;
+ }
+ printk("%sXFS: %pV\n", level, vaf);
+}
+
+#define define_xfs_printk_level(func, kern_level) \
+void func(const struct xfs_mount *mp, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ __xfs_printk(kern_level, mp, &vaf); \
+ va_end(args); \
+} \
+
+define_xfs_printk_level(xfs_emerg, KERN_EMERG);
+define_xfs_printk_level(xfs_alert, KERN_ALERT);
+define_xfs_printk_level(xfs_crit, KERN_CRIT);
+define_xfs_printk_level(xfs_err, KERN_ERR);
+define_xfs_printk_level(xfs_warn, KERN_WARNING);
+define_xfs_printk_level(xfs_notice, KERN_NOTICE);
+define_xfs_printk_level(xfs_info, KERN_INFO);
+#ifdef DEBUG
+define_xfs_printk_level(xfs_debug, KERN_DEBUG);
+#endif
+
+void
+xfs_alert_tag(
+ const struct xfs_mount *mp,
+ int panic_tag,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int do_panic = 0;
+
+ if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+ xfs_alert(mp, "Transforming an alert into a BUG.");
+ do_panic = 1;
+ }
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(KERN_ALERT, mp, &vaf);
+ va_end(args);
+
+ BUG_ON(do_panic);
+}
+
+void
+assfail(char *expr, char *file, int line)
+{
+ xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
+ expr, file, line);
+ BUG();
+}
+
+void
+xfs_hex_dump(void *p, int length)
+{
+ print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 00000000..7fb7ea00
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,39 @@
+#ifndef __XFS_MESSAGE_H
+#define __XFS_MESSAGE_H 1
+
+struct xfs_mount;
+
+extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
+ const char *fmt, ...)
+ __attribute__ ((format (printf, 3, 4)));
+extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+
+#ifdef DEBUG
+extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+#else
+static inline void
+__attribute__ ((format (printf, 2, 3)))
+xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+{
+}
+#endif
+
+extern void assfail(char *expr, char *f, int l);
+
+extern void xfs_hex_dump(void *p, int length);
+
+#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644
index 00000000..29b9d642
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "quota/xfs_qm.h"
+#include <linux/quota.h>
+
+
+STATIC int
+xfs_quota_type(int type)
+{
+ switch (type) {
+ case USRQUOTA:
+ return XFS_DQ_USER;
+ case GRPQUOTA:
+ return XFS_DQ_GROUP;
+ default:
+ return XFS_DQ_PROJ;
+ }
+}
+
+STATIC int
+xfs_fs_get_xstate(
+ struct super_block *sb,
+ struct fs_quota_stat *fqs)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+STATIC int
+xfs_fs_set_xstate(
+ struct super_block *sb,
+ unsigned int uflags,
+ int op)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags = 0;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+
+ if (uflags & FS_QUOTA_UDQ_ACCT)
+ flags |= XFS_UQUOTA_ACCT;
+ if (uflags & FS_QUOTA_PDQ_ACCT)
+ flags |= XFS_PQUOTA_ACCT;
+ if (uflags & FS_QUOTA_GDQ_ACCT)
+ flags |= XFS_GQUOTA_ACCT;
+ if (uflags & FS_QUOTA_UDQ_ENFD)
+ flags |= XFS_UQUOTA_ENFD;
+ if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
+ flags |= XFS_OQUOTA_ENFD;
+
+ switch (op) {
+ case Q_XQUOTAON:
+ return -xfs_qm_scall_quotaon(mp, flags);
+ case Q_XQUOTAOFF:
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+ return -xfs_qm_scall_quotaoff(mp, flags);
+ case Q_XQUOTARM:
+ if (XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+ return -xfs_qm_scall_trunc_qfiles(mp, flags);
+ }
+
+ return -EINVAL;
+}
+
+STATIC int
+xfs_fs_get_dqblk(
+ struct super_block *sb,
+ int type,
+ qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+
+ return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+}
+
+STATIC int
+xfs_fs_set_dqblk(
+ struct super_block *sb,
+ int type,
+ qid_t id,
+ struct fs_disk_quota *fdq)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+
+ return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+}
+
+const struct quotactl_ops xfs_quotactl_operations = {
+ .get_xstate = xfs_fs_get_xstate,
+ .set_xstate = xfs_fs_set_xstate,
+ .get_dqblk = xfs_fs_get_dqblk,
+ .set_dqblk = xfs_fs_set_dqblk,
+};
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
new file mode 100644
index 00000000..76fdc586
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include <linux/proc_fs.h>
+
+DEFINE_PER_CPU(struct xfsstats, xfsstats);
+
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
+{
+ int c, i, j, val;
+ __uint64_t xs_xstrat_bytes = 0;
+ __uint64_t xs_write_bytes = 0;
+ __uint64_t xs_read_bytes = 0;
+
+ static const struct xstats_entry {
+ char *desc;
+ int endpoint;
+ } xstats[] = {
+ { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC },
+ { "abt", XFSSTAT_END_ALLOC_BTREE },
+ { "blk_map", XFSSTAT_END_BLOCK_MAPPING },
+ { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE },
+ { "dir", XFSSTAT_END_DIRECTORY_OPS },
+ { "trans", XFSSTAT_END_TRANSACTIONS },
+ { "ig", XFSSTAT_END_INODE_OPS },
+ { "log", XFSSTAT_END_LOG_OPS },
+ { "push_ail", XFSSTAT_END_TAIL_PUSHING },
+ { "xstrat", XFSSTAT_END_WRITE_CONVERT },
+ { "rw", XFSSTAT_END_READ_WRITE_OPS },
+ { "attr", XFSSTAT_END_ATTRIBUTE_OPS },
+ { "icluster", XFSSTAT_END_INODE_CLUSTER },
+ { "vnodes", XFSSTAT_END_VNODE_OPS },
+ { "buf", XFSSTAT_END_BUF },
+ { "abtb2", XFSSTAT_END_ABTB_V2 },
+ { "abtc2", XFSSTAT_END_ABTC_V2 },
+ { "bmbt2", XFSSTAT_END_BMBT_V2 },
+ { "ibt2", XFSSTAT_END_IBT_V2 },
+ };
+
+ /* Loop over all stats groups */
+ for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+ seq_printf(m, "%s", xstats[i].desc);
+ /* inner loop does each group */
+ while (j < xstats[i].endpoint) {
+ val = 0;
+ /* sum over all cpus */
+ for_each_possible_cpu(c)
+ val += *(((__u32*)&per_cpu(xfsstats, c) + j));
+ seq_printf(m, " %u", val);
+ j++;
+ }
+ seq_putc(m, '\n');
+ }
+ /* extra precision counters */
+ for_each_possible_cpu(i) {
+ xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
+ xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
+ xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+ }
+
+ seq_printf(m, "xpc %Lu %Lu %Lu\n",
+ xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
+ seq_printf(m, "debug %u\n",
+#if defined(DEBUG)
+ 1);
+#else
+ 0);
+#endif
+ return 0;
+}
+
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xfs_stat_proc_show, NULL);
+}
+
+static const struct file_operations xfs_stat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xfs_stat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int
+xfs_init_procfs(void)
+{
+ if (!proc_mkdir("fs/xfs", NULL))
+ goto out;
+
+ if (!proc_create("fs/xfs/stat", 0, NULL,
+ &xfs_stat_proc_fops))
+ goto out_remove_entry;
+ return 0;
+
+ out_remove_entry:
+ remove_proc_entry("fs/xfs", NULL);
+ out:
+ return -ENOMEM;
+}
+
+void
+xfs_cleanup_procfs(void)
+{
+ remove_proc_entry("fs/xfs/stat", NULL);
+ remove_proc_entry("fs/xfs", NULL);
+}
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
new file mode 100644
index 00000000..736854b1
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_STATS_H__
+#define __XFS_STATS_H__
+
+
+#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
+
+#include <linux/percpu.h>
+
+/*
+ * XFS global statistics
+ */
+struct xfsstats {
+# define XFSSTAT_END_EXTENT_ALLOC 4
+ __uint32_t xs_allocx;
+ __uint32_t xs_allocb;
+ __uint32_t xs_freex;
+ __uint32_t xs_freeb;
+# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4)
+ __uint32_t xs_abt_lookup;
+ __uint32_t xs_abt_compare;
+ __uint32_t xs_abt_insrec;
+ __uint32_t xs_abt_delrec;
+# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7)
+ __uint32_t xs_blk_mapr;
+ __uint32_t xs_blk_mapw;
+ __uint32_t xs_blk_unmap;
+ __uint32_t xs_add_exlist;
+ __uint32_t xs_del_exlist;
+ __uint32_t xs_look_exlist;
+ __uint32_t xs_cmp_exlist;
+# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4)
+ __uint32_t xs_bmbt_lookup;
+ __uint32_t xs_bmbt_compare;
+ __uint32_t xs_bmbt_insrec;
+ __uint32_t xs_bmbt_delrec;
+# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4)
+ __uint32_t xs_dir_lookup;
+ __uint32_t xs_dir_create;
+ __uint32_t xs_dir_remove;
+ __uint32_t xs_dir_getdents;
+# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3)
+ __uint32_t xs_trans_sync;
+ __uint32_t xs_trans_async;
+ __uint32_t xs_trans_empty;
+# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7)
+ __uint32_t xs_ig_attempts;
+ __uint32_t xs_ig_found;
+ __uint32_t xs_ig_frecycle;
+ __uint32_t xs_ig_missed;
+ __uint32_t xs_ig_dup;
+ __uint32_t xs_ig_reclaims;
+ __uint32_t xs_ig_attrchg;
+# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5)
+ __uint32_t xs_log_writes;
+ __uint32_t xs_log_blocks;
+ __uint32_t xs_log_noiclogs;
+ __uint32_t xs_log_force;
+ __uint32_t xs_log_force_sleep;
+# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10)
+ __uint32_t xs_try_logspace;
+ __uint32_t xs_sleep_logspace;
+ __uint32_t xs_push_ail;
+ __uint32_t xs_push_ail_success;
+ __uint32_t xs_push_ail_pushbuf;
+ __uint32_t xs_push_ail_pinned;
+ __uint32_t xs_push_ail_locked;
+ __uint32_t xs_push_ail_flushing;
+ __uint32_t xs_push_ail_restarts;
+ __uint32_t xs_push_ail_flush;
+# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2)
+ __uint32_t xs_xstrat_quick;
+ __uint32_t xs_xstrat_split;
+# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2)
+ __uint32_t xs_write_calls;
+ __uint32_t xs_read_calls;
+# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4)
+ __uint32_t xs_attr_get;
+ __uint32_t xs_attr_set;
+ __uint32_t xs_attr_remove;
+ __uint32_t xs_attr_list;
+# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3)
+ __uint32_t xs_iflush_count;
+ __uint32_t xs_icluster_flushcnt;
+ __uint32_t xs_icluster_flushinode;
+# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8)
+ __uint32_t vn_active; /* # vnodes not on free lists */
+ __uint32_t vn_alloc; /* # times vn_alloc called */
+ __uint32_t vn_get; /* # times vn_get called */
+ __uint32_t vn_hold; /* # times vn_hold called */
+ __uint32_t vn_rele; /* # times vn_rele called */
+ __uint32_t vn_reclaim; /* # times vn_reclaim called */
+ __uint32_t vn_remove; /* # times vn_remove called */
+ __uint32_t vn_free; /* # times vn_free called */
+#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9)
+ __uint32_t xb_get;
+ __uint32_t xb_create;
+ __uint32_t xb_get_locked;
+ __uint32_t xb_get_locked_waited;
+ __uint32_t xb_busy_locked;
+ __uint32_t xb_miss_locked;
+ __uint32_t xb_page_retries;
+ __uint32_t xb_page_found;
+ __uint32_t xb_get_read;
+/* Version 2 btree counters */
+#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
+ __uint32_t xs_abtb_2_lookup;
+ __uint32_t xs_abtb_2_compare;
+ __uint32_t xs_abtb_2_insrec;
+ __uint32_t xs_abtb_2_delrec;
+ __uint32_t xs_abtb_2_newroot;
+ __uint32_t xs_abtb_2_killroot;
+ __uint32_t xs_abtb_2_increment;
+ __uint32_t xs_abtb_2_decrement;
+ __uint32_t xs_abtb_2_lshift;
+ __uint32_t xs_abtb_2_rshift;
+ __uint32_t xs_abtb_2_split;
+ __uint32_t xs_abtb_2_join;
+ __uint32_t xs_abtb_2_alloc;
+ __uint32_t xs_abtb_2_free;
+ __uint32_t xs_abtb_2_moves;
+#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
+ __uint32_t xs_abtc_2_lookup;
+ __uint32_t xs_abtc_2_compare;
+ __uint32_t xs_abtc_2_insrec;
+ __uint32_t xs_abtc_2_delrec;
+ __uint32_t xs_abtc_2_newroot;
+ __uint32_t xs_abtc_2_killroot;
+ __uint32_t xs_abtc_2_increment;
+ __uint32_t xs_abtc_2_decrement;
+ __uint32_t xs_abtc_2_lshift;
+ __uint32_t xs_abtc_2_rshift;
+ __uint32_t xs_abtc_2_split;
+ __uint32_t xs_abtc_2_join;
+ __uint32_t xs_abtc_2_alloc;
+ __uint32_t xs_abtc_2_free;
+ __uint32_t xs_abtc_2_moves;
+#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
+ __uint32_t xs_bmbt_2_lookup;
+ __uint32_t xs_bmbt_2_compare;
+ __uint32_t xs_bmbt_2_insrec;
+ __uint32_t xs_bmbt_2_delrec;
+ __uint32_t xs_bmbt_2_newroot;
+ __uint32_t xs_bmbt_2_killroot;
+ __uint32_t xs_bmbt_2_increment;
+ __uint32_t xs_bmbt_2_decrement;
+ __uint32_t xs_bmbt_2_lshift;
+ __uint32_t xs_bmbt_2_rshift;
+ __uint32_t xs_bmbt_2_split;
+ __uint32_t xs_bmbt_2_join;
+ __uint32_t xs_bmbt_2_alloc;
+ __uint32_t xs_bmbt_2_free;
+ __uint32_t xs_bmbt_2_moves;
+#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
+ __uint32_t xs_ibt_2_lookup;
+ __uint32_t xs_ibt_2_compare;
+ __uint32_t xs_ibt_2_insrec;
+ __uint32_t xs_ibt_2_delrec;
+ __uint32_t xs_ibt_2_newroot;
+ __uint32_t xs_ibt_2_killroot;
+ __uint32_t xs_ibt_2_increment;
+ __uint32_t xs_ibt_2_decrement;
+ __uint32_t xs_ibt_2_lshift;
+ __uint32_t xs_ibt_2_rshift;
+ __uint32_t xs_ibt_2_split;
+ __uint32_t xs_ibt_2_join;
+ __uint32_t xs_ibt_2_alloc;
+ __uint32_t xs_ibt_2_free;
+ __uint32_t xs_ibt_2_moves;
+/* Extra precision counters */
+ __uint64_t xs_xstrat_bytes;
+ __uint64_t xs_write_bytes;
+ __uint64_t xs_read_bytes;
+};
+
+DECLARE_PER_CPU(struct xfsstats, xfsstats);
+
+/*
+ * We don't disable preempt, not too worried about poking the
+ * wrong CPU's stat for now (also aggregated before reporting).
+ */
+#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
+#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
+#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
+
+extern int xfs_init_procfs(void);
+extern void xfs_cleanup_procfs(void);
+
+
+#else /* !CONFIG_PROC_FS */
+
+# define XFS_STATS_INC(count)
+# define XFS_STATS_DEC(count)
+# define XFS_STATS_ADD(count, inc)
+
+static inline int xfs_init_procfs(void)
+{
+ return 0;
+}
+
+static inline void xfs_cleanup_procfs(void)
+{
+}
+
+#endif /* !CONFIG_PROC_FS */
+
+#endif /* __XFS_STATS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
new file mode 100644
index 00000000..e6ac98c1
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -0,0 +1,1720 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_fsops.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_vnodeops.h"
+#include "xfs_log_priv.h"
+#include "xfs_trans_priv.h"
+#include "xfs_filestream.h"
+#include "xfs_da_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_mru_cache.h"
+#include "xfs_inode_item.h"
+#include "xfs_sync.h"
+#include "xfs_trace.h"
+
+#include <linux/namei.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/mempool.h>
+#include <linux/writeback.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/parser.h>
+
+static const struct super_operations xfs_super_operations;
+static kmem_zone_t *xfs_ioend_zone;
+mempool_t *xfs_ioend_pool;
+
+#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
+#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
+#define MNTOPT_LOGDEV "logdev" /* log device */
+#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
+#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
+#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
+#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
+#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
+#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
+#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */
+#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */
+#define MNTOPT_MTPT "mtpt" /* filesystem mount point */
+#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */
+#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */
+#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */
+#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */
+#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */
+#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
+#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
+ * unwritten extent conversion */
+#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
+#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
+#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
+#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
+#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
+#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes
+ * in stat(). */
+#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */
+#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */
+#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */
+#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
+#define MNTOPT_NOQUOTA "noquota" /* no quotas */
+#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
+#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
+#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */
+#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
+#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
+#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */
+#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
+#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
+#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
+#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
+#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
+#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
+#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
+
+/*
+ * Table driven mount option parser.
+ *
+ * Currently only used for remount, but it will be used for mount
+ * in the future, too.
+ */
+enum {
+ Opt_barrier, Opt_nobarrier, Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_err, NULL}
+};
+
+
+STATIC unsigned long
+suffix_strtoul(char *s, char **endp, unsigned int base)
+{
+ int last, shift_left_factor = 0;
+ char *value = s;
+
+ last = strlen(value) - 1;
+ if (value[last] == 'K' || value[last] == 'k') {
+ shift_left_factor = 10;
+ value[last] = '\0';
+ }
+ if (value[last] == 'M' || value[last] == 'm') {
+ shift_left_factor = 20;
+ value[last] = '\0';
+ }
+ if (value[last] == 'G' || value[last] == 'g') {
+ shift_left_factor = 30;
+ value[last] = '\0';
+ }
+
+ return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ *
+ * Note that this function leaks the various device name allocations on
+ * failure. The caller takes care of them.
+ */
+STATIC int
+xfs_parseargs(
+ struct xfs_mount *mp,
+ char *options)
+{
+ struct super_block *sb = mp->m_super;
+ char *this_char, *value, *eov;
+ int dsunit = 0;
+ int dswidth = 0;
+ int iosize = 0;
+ __uint8_t iosizelog = 0;
+
+ /*
+ * set up the mount name first so all the errors will refer to the
+ * correct device.
+ */
+ mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
+ if (!mp->m_fsname)
+ return ENOMEM;
+ mp->m_fsname_len = strlen(mp->m_fsname) + 1;
+
+ /*
+ * Copy binary VFS mount flags we are interested in.
+ */
+ if (sb->s_flags & MS_RDONLY)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ if (sb->s_flags & MS_DIRSYNC)
+ mp->m_flags |= XFS_MOUNT_DIRSYNC;
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+
+ /*
+ * Set some default flags that could be cleared by the mount option
+ * parsing.
+ */
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+ mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ mp->m_flags |= XFS_MOUNT_DELAYLOG;
+
+ /*
+ * These can be overridden by the mount option parsing.
+ */
+ mp->m_logbufs = -1;
+ mp->m_logbsize = -1;
+
+ if (!options)
+ goto done;
+
+ while ((this_char = strsep(&options, ",")) != NULL) {
+ if (!*this_char)
+ continue;
+ if ((value = strchr(this_char, '=')) != NULL)
+ *value++ = 0;
+
+ if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ mp->m_logbufs = simple_strtoul(value, &eov, 10);
+ } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ mp->m_logbsize = suffix_strtoul(value, &eov, 10);
+ } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ if (!mp->m_logname)
+ return ENOMEM;
+ } else if (!strcmp(this_char, MNTOPT_MTPT)) {
+ xfs_warn(mp, "%s option not allowed on this system",
+ this_char);
+ return EINVAL;
+ } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ if (!mp->m_rtname)
+ return ENOMEM;
+ } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ iosize = simple_strtoul(value, &eov, 10);
+ iosizelog = ffs(iosize) - 1;
+ } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ iosize = suffix_strtoul(value, &eov, 10);
+ iosizelog = ffs(iosize) - 1;
+ } else if (!strcmp(this_char, MNTOPT_GRPID) ||
+ !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+ mp->m_flags |= XFS_MOUNT_GRPID;
+ } else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
+ !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+ mp->m_flags &= ~XFS_MOUNT_GRPID;
+ } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+ } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+ mp->m_flags |= XFS_MOUNT_NORECOVERY;
+ } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+ mp->m_flags |= XFS_MOUNT_NOALIGN;
+ } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
+ mp->m_flags |= XFS_MOUNT_SWALLOC;
+ } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ dsunit = simple_strtoul(value, &eov, 10);
+ } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
+ if (!value || !*value) {
+ xfs_warn(mp, "%s option requires an argument",
+ this_char);
+ return EINVAL;
+ }
+ dswidth = simple_strtoul(value, &eov, 10);
+ } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+#if !XFS_BIG_INUMS
+ xfs_warn(mp, "%s option not allowed on this system",
+ this_char);
+ return EINVAL;
+#endif
+ } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+ mp->m_flags |= XFS_MOUNT_NOUUID;
+ } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+ } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
+ mp->m_flags &= ~XFS_MOUNT_BARRIER;
+ } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
+ mp->m_flags |= XFS_MOUNT_IKEEP;
+ } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
+ mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
+ mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
+ } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
+ mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+ } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+ } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
+ mp->m_flags &= ~XFS_MOUNT_ATTR2;
+ mp->m_flags |= XFS_MOUNT_NOATTR2;
+ } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
+ mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+ } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
+ mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+ XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+ XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+ XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
+ } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
+ !strcmp(this_char, MNTOPT_UQUOTA) ||
+ !strcmp(this_char, MNTOPT_USRQUOTA)) {
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+ XFS_UQUOTA_ENFD);
+ } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
+ !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+ } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
+ !strcmp(this_char, MNTOPT_PRJQUOTA)) {
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+ XFS_OQUOTA_ENFD);
+ } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
+ !strcmp(this_char, MNTOPT_GRPQUOTA)) {
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+ XFS_OQUOTA_ENFD);
+ } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+ } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
+ mp->m_flags |= XFS_MOUNT_DELAYLOG;
+ } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
+ mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
+ } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+ mp->m_flags |= XFS_MOUNT_DISCARD;
+ } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+ mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ } else if (!strcmp(this_char, "ihashsize")) {
+ xfs_warn(mp,
+ "ihashsize no longer used, option is deprecated.");
+ } else if (!strcmp(this_char, "osyncisdsync")) {
+ xfs_warn(mp,
+ "osyncisdsync has no effect, option is deprecated.");
+ } else if (!strcmp(this_char, "osyncisosync")) {
+ xfs_warn(mp,
+ "osyncisosync has no effect, option is deprecated.");
+ } else if (!strcmp(this_char, "irixsgid")) {
+ xfs_warn(mp,
+ "irixsgid is now a sysctl(2) variable, option is deprecated.");
+ } else {
+ xfs_warn(mp, "unknown mount option [%s].", this_char);
+ return EINVAL;
+ }
+ }
+
+ /*
+ * no recovery flag requires a read-only mount
+ */
+ if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+ !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ xfs_warn(mp, "no-recovery mounts must be read-only.");
+ return EINVAL;
+ }
+
+ if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
+ xfs_warn(mp,
+ "sunit and swidth options incompatible with the noalign option");
+ return EINVAL;
+ }
+
+ if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
+ !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
+ xfs_warn(mp,
+ "the discard option is incompatible with the nodelaylog option");
+ return EINVAL;
+ }
+
+#ifndef CONFIG_XFS_QUOTA
+ if (XFS_IS_QUOTA_RUNNING(mp)) {
+ xfs_warn(mp, "quota support not available in this kernel.");
+ return EINVAL;
+ }
+#endif
+
+ if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+ (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
+ xfs_warn(mp, "cannot mount with both project and group quota");
+ return EINVAL;
+ }
+
+ if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
+ xfs_warn(mp, "sunit and swidth must be specified together");
+ return EINVAL;
+ }
+
+ if (dsunit && (dswidth % dsunit != 0)) {
+ xfs_warn(mp,
+ "stripe width (%d) must be a multiple of the stripe unit (%d)",
+ dswidth, dsunit);
+ return EINVAL;
+ }
+
+done:
+ if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+ /*
+ * At this point the superblock has not been read
+ * in, therefore we do not know the block size.
+ * Before the mount call ends we will convert
+ * these to FSBs.
+ */
+ if (dsunit) {
+ mp->m_dalign = dsunit;
+ mp->m_flags |= XFS_MOUNT_RETERR;
+ }
+
+ if (dswidth)
+ mp->m_swidth = dswidth;
+ }
+
+ if (mp->m_logbufs != -1 &&
+ mp->m_logbufs != 0 &&
+ (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+ mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+ xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
+ mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+ return XFS_ERROR(EINVAL);
+ }
+ if (mp->m_logbsize != -1 &&
+ mp->m_logbsize != 0 &&
+ (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+ mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+ !is_power_of_2(mp->m_logbsize))) {
+ xfs_warn(mp,
+ "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+ mp->m_logbsize);
+ return XFS_ERROR(EINVAL);
+ }
+
+ if (iosizelog) {
+ if (iosizelog > XFS_MAX_IO_LOG ||
+ iosizelog < XFS_MIN_IO_LOG) {
+ xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
+ iosizelog, XFS_MIN_IO_LOG,
+ XFS_MAX_IO_LOG);
+ return XFS_ERROR(EINVAL);
+ }
+
+ mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+ mp->m_readio_log = iosizelog;
+ mp->m_writeio_log = iosizelog;
+ }
+
+ return 0;
+}
+
+struct proc_xfs_info {
+ int flag;
+ char *str;
+};
+
+STATIC int
+xfs_showargs(
+ struct xfs_mount *mp,
+ struct seq_file *m)
+{
+ static struct proc_xfs_info xfs_info_set[] = {
+ /* the few simple ones we can get from the mount struct */
+ { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP },
+ { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
+ { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
+ { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
+ { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
+ { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
+ { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
+ { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
+ { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
+ { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
+ { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
+ { 0, NULL }
+ };
+ static struct proc_xfs_info xfs_info_unset[] = {
+ /* the few simple ones we can get from the mount struct */
+ { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO },
+ { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER },
+ { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE },
+ { 0, NULL }
+ };
+ struct proc_xfs_info *xfs_infop;
+
+ for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
+ if (mp->m_flags & xfs_infop->flag)
+ seq_puts(m, xfs_infop->str);
+ }
+ for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) {
+ if (!(mp->m_flags & xfs_infop->flag))
+ seq_puts(m, xfs_infop->str);
+ }
+
+ if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+ seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
+ (int)(1 << mp->m_writeio_log) >> 10);
+
+ if (mp->m_logbufs > 0)
+ seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
+ if (mp->m_logbsize > 0)
+ seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
+
+ if (mp->m_logname)
+ seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+ if (mp->m_rtname)
+ seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+
+ if (mp->m_dalign > 0)
+ seq_printf(m, "," MNTOPT_SUNIT "=%d",
+ (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+ if (mp->m_swidth > 0)
+ seq_printf(m, "," MNTOPT_SWIDTH "=%d",
+ (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+ if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
+ seq_puts(m, "," MNTOPT_USRQUOTA);
+ else if (mp->m_qflags & XFS_UQUOTA_ACCT)
+ seq_puts(m, "," MNTOPT_UQUOTANOENF);
+
+ /* Either project or group quotas can be active, not both */
+
+ if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_PRJQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_PQUOTANOENF);
+ } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_GRPQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ }
+
+ if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
+ seq_puts(m, "," MNTOPT_NOQUOTA);
+
+ return 0;
+}
+__uint64_t
+xfs_max_file_offset(
+ unsigned int blockshift)
+{
+ unsigned int pagefactor = 1;
+ unsigned int bitshift = BITS_PER_LONG - 1;
+
+ /* Figure out maximum filesize, on Linux this can depend on
+ * the filesystem blocksize (on 32 bit platforms).
+ * __block_write_begin does this in an [unsigned] long...
+ * page->index << (PAGE_CACHE_SHIFT - bbits)
+ * So, for page sized blocks (4K on 32 bit platforms),
+ * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
+ * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+ * but for smaller blocksizes it is less (bbits = log2 bsize).
+ * Note1: get_block_t takes a long (implicit cast from above)
+ * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
+ * can optionally convert the [unsigned] long from above into
+ * an [unsigned] long long.
+ */
+
+#if BITS_PER_LONG == 32
+# if defined(CONFIG_LBDAF)
+ ASSERT(sizeof(sector_t) == 8);
+ pagefactor = PAGE_CACHE_SIZE;
+ bitshift = BITS_PER_LONG;
+# else
+ pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+# endif
+#endif
+
+ return (((__uint64_t)pagefactor) << bitshift) - 1;
+}
+
+STATIC int
+xfs_blkdev_get(
+ xfs_mount_t *mp,
+ const char *name,
+ struct block_device **bdevp)
+{
+ int error = 0;
+
+ *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ mp);
+ if (IS_ERR(*bdevp)) {
+ error = PTR_ERR(*bdevp);
+ xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
+ }
+
+ return -error;
+}
+
+STATIC void
+xfs_blkdev_put(
+ struct block_device *bdev)
+{
+ if (bdev)
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+void
+xfs_blkdev_issue_flush(
+ xfs_buftarg_t *buftarg)
+{
+ blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
+}
+
+STATIC void
+xfs_close_devices(
+ struct xfs_mount *mp)
+{
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+ struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+ xfs_free_buftarg(mp, mp->m_logdev_targp);
+ xfs_blkdev_put(logdev);
+ }
+ if (mp->m_rtdev_targp) {
+ struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+ xfs_free_buftarg(mp, mp->m_rtdev_targp);
+ xfs_blkdev_put(rtdev);
+ }
+ xfs_free_buftarg(mp, mp->m_ddev_targp);
+}
+
+/*
+ * The file system configurations are:
+ * (1) device (partition) with data and internal log
+ * (2) logical volume with data and log subvolumes.
+ * (3) logical volume with data, log, and realtime subvolumes.
+ *
+ * We only have to handle opening the log and realtime volumes here if
+ * they are present. The data subvolume has already been opened by
+ * get_sb_bdev() and is stored in sb->s_bdev.
+ */
+STATIC int
+xfs_open_devices(
+ struct xfs_mount *mp)
+{
+ struct block_device *ddev = mp->m_super->s_bdev;
+ struct block_device *logdev = NULL, *rtdev = NULL;
+ int error;
+
+ /*
+ * Open real time and log devices - order is important.
+ */
+ if (mp->m_logname) {
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+ if (error)
+ goto out;
+ }
+
+ if (mp->m_rtname) {
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+ if (error)
+ goto out_close_logdev;
+
+ if (rtdev == ddev || rtdev == logdev) {
+ xfs_warn(mp,
+ "Cannot mount filesystem with identical rtdev and ddev/logdev.");
+ error = EINVAL;
+ goto out_close_rtdev;
+ }
+ }
+
+ /*
+ * Setup xfs_mount buffer target pointers
+ */
+ error = ENOMEM;
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
+ if (!mp->m_ddev_targp)
+ goto out_close_rtdev;
+
+ if (rtdev) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
+ mp->m_fsname);
+ if (!mp->m_rtdev_targp)
+ goto out_free_ddev_targ;
+ }
+
+ if (logdev && logdev != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
+ mp->m_fsname);
+ if (!mp->m_logdev_targp)
+ goto out_free_rtdev_targ;
+ } else {
+ mp->m_logdev_targp = mp->m_ddev_targp;
+ }
+
+ return 0;
+
+ out_free_rtdev_targ:
+ if (mp->m_rtdev_targp)
+ xfs_free_buftarg(mp, mp->m_rtdev_targp);
+ out_free_ddev_targ:
+ xfs_free_buftarg(mp, mp->m_ddev_targp);
+ out_close_rtdev:
+ if (rtdev)
+ xfs_blkdev_put(rtdev);
+ out_close_logdev:
+ if (logdev && logdev != ddev)
+ xfs_blkdev_put(logdev);
+ out:
+ return error;
+}
+
+/*
+ * Setup xfs_mount buffer target pointers based on superblock
+ */
+STATIC int
+xfs_setup_devices(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
+ mp->m_sb.sb_sectsize);
+ if (error)
+ return error;
+
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+ unsigned int log_sector_size = BBSIZE;
+
+ if (xfs_sb_version_hassector(&mp->m_sb))
+ log_sector_size = mp->m_sb.sb_logsectsize;
+ error = xfs_setsize_buftarg(mp->m_logdev_targp,
+ mp->m_sb.sb_blocksize,
+ log_sector_size);
+ if (error)
+ return error;
+ }
+ if (mp->m_rtdev_targp) {
+ error = xfs_setsize_buftarg(mp->m_rtdev_targp,
+ mp->m_sb.sb_blocksize,
+ mp->m_sb.sb_sectsize);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Catch misguided souls that try to use this interface on XFS */
+STATIC struct inode *
+xfs_fs_alloc_inode(
+ struct super_block *sb)
+{
+ BUG();
+ return NULL;
+}
+
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can reclaim the inode.
+ */
+STATIC void
+xfs_fs_destroy_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ trace_xfs_destroy_inode(ip);
+
+ XFS_STATS_INC(vn_reclaim);
+
+ /* bad inode, get out here ASAP */
+ if (is_bad_inode(inode))
+ goto out_reclaim;
+
+ xfs_ioend_wait(ip);
+
+ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+ /*
+ * We should never get here with one of the reclaim flags already set.
+ */
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+ /*
+ * We always use background reclaim here because even if the
+ * inode is clean, it still may be under IO and hence we have
+ * to take the flush lock. The background reclaim path handles
+ * this more efficiently than we can here, so simply let background
+ * reclaim tear down all inodes.
+ */
+out_reclaim:
+ xfs_inode_set_reclaim_tag(ip);
+}
+
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly initialise
+ * fields in the xfs inode that left in the initialise state
+ * when freeing the inode.
+ */
+STATIC void
+xfs_fs_inode_init_once(
+ void *inode)
+{
+ struct xfs_inode *ip = inode;
+
+ memset(ip, 0, sizeof(struct xfs_inode));
+
+ /* vfs inode */
+ inode_init_once(VFS_I(ip));
+
+ /* xfs inode */
+ atomic_set(&ip->i_iocount, 0);
+ atomic_set(&ip->i_pincount, 0);
+ spin_lock_init(&ip->i_flags_lock);
+ init_waitqueue_head(&ip->i_ipin_wait);
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&ip->i_flush);
+ complete(&ip->i_flush);
+
+ mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+ "xfsino", ip->i_ino);
+}
+
+/*
+ * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
+ * we catch unlogged VFS level updates to the inode.
+ *
+ * We need the barrier() to maintain correct ordering between unlogged
+ * updates and the transaction commit code that clears the i_update_core
+ * field. This requires all updates to be completed before marking the
+ * inode dirty.
+ */
+STATIC void
+xfs_fs_dirty_inode(
+ struct inode *inode,
+ int flags)
+{
+ barrier();
+ XFS_I(inode)->i_update_core = 1;
+}
+
+STATIC int
+xfs_fs_write_inode(
+ struct inode *inode,
+ struct writeback_control *wbc)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ int error = EAGAIN;
+
+ trace_xfs_write_inode(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -XFS_ERROR(EIO);
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
+ /*
+ * Make sure the inode has made it it into the log. Instead
+ * of forcing it all the way to stable storage using a
+ * synchronous transaction we let the log force inside the
+ * ->sync_fs call do that for thus, which reduces the number
+ * of synchronous log foces dramatically.
+ */
+ xfs_ioend_wait(ip);
+ error = xfs_log_dirty_inode(ip, NULL, 0);
+ if (error)
+ goto out;
+ return 0;
+ } else {
+ if (!ip->i_update_core)
+ return 0;
+
+ /*
+ * We make this non-blocking if the inode is contended, return
+ * EAGAIN to indicate to the caller that they did not succeed.
+ * This prevents the flush path from blocking on inodes inside
+ * another operation right now, they get caught later by
+ * xfs_sync.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+ goto out;
+
+ if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+ goto out_unlock;
+
+ /*
+ * Now we have the flush lock and the inode is not pinned, we
+ * can check if the inode is really clean as we know that
+ * there are no pending transaction completions, it is not
+ * waiting on the delayed write queue and there is no IO in
+ * progress.
+ */
+ if (xfs_inode_clean(ip)) {
+ xfs_ifunlock(ip);
+ error = 0;
+ goto out_unlock;
+ }
+ error = xfs_iflush(ip, SYNC_TRYLOCK);
+ }
+
+ out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ out:
+ /*
+ * if we failed to write out the inode then mark
+ * it dirty again so we'll try again later.
+ */
+ if (error)
+ xfs_mark_inode_dirty_sync(ip);
+ return -error;
+}
+
+STATIC void
+xfs_fs_evict_inode(
+ struct inode *inode)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+
+ trace_xfs_evict_inode(ip);
+
+ truncate_inode_pages(&inode->i_data, 0);
+ end_writeback(inode);
+ XFS_STATS_INC(vn_rele);
+ XFS_STATS_INC(vn_remove);
+ XFS_STATS_DEC(vn_active);
+
+ /*
+ * The iolock is used by the file system to coordinate reads,
+ * writes, and block truncates. Up to this point the lock
+ * protected concurrent accesses by users of the inode. But
+ * from here forward we're doing some final processing of the
+ * inode because we're done with it, and although we reuse the
+ * iolock for protection it is really a distinct lock class
+ * (in the lockdep sense) from before. To keep lockdep happy
+ * (and basically indicate what we are doing), we explicitly
+ * re-init the iolock here.
+ */
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+ &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
+
+ xfs_inactive(ip);
+}
+
+STATIC void
+xfs_free_fsname(
+ struct xfs_mount *mp)
+{
+ kfree(mp->m_fsname);
+ kfree(mp->m_rtname);
+ kfree(mp->m_logname);
+}
+
+STATIC void
+xfs_fs_put_super(
+ struct super_block *sb)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ /*
+ * Unregister the memory shrinker before we tear down the mount
+ * structure so we don't have memory reclaim racing with us here.
+ */
+ xfs_inode_shrinker_unregister(mp);
+ xfs_syncd_stop(mp);
+
+ /*
+ * Blow away any referenced inode in the filestreams cache.
+ * This can and will cause log traffic as inodes go inactive
+ * here.
+ */
+ xfs_filestream_unmount(mp);
+
+ XFS_bflush(mp->m_ddev_targp);
+
+ xfs_unmountfs(mp);
+ xfs_freesb(mp);
+ xfs_icsb_destroy_counters(mp);
+ xfs_close_devices(mp);
+ xfs_free_fsname(mp);
+ kfree(mp);
+}
+
+STATIC int
+xfs_fs_sync_fs(
+ struct super_block *sb,
+ int wait)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ int error;
+
+ /*
+ * Not much we can do for the first async pass. Writing out the
+ * superblock would be counter-productive as we are going to redirty
+ * when writing out other data and metadata (and writing out a single
+ * block is quite fast anyway).
+ *
+ * Try to asynchronously kick off quota syncing at least.
+ */
+ if (!wait) {
+ xfs_qm_sync(mp, SYNC_TRYLOCK);
+ return 0;
+ }
+
+ error = xfs_quiesce_data(mp);
+ if (error)
+ return -error;
+
+ if (laptop_mode) {
+ /*
+ * The disk must be active because we're syncing.
+ * We schedule xfssyncd now (now that the disk is
+ * active) instead of later (when it might not be).
+ */
+ flush_delayed_work_sync(&mp->m_sync_work);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_fs_statfs(
+ struct dentry *dentry,
+ struct kstatfs *statp)
+{
+ struct xfs_mount *mp = XFS_M(dentry->d_sb);
+ xfs_sb_t *sbp = &mp->m_sb;
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ __uint64_t fakeinos, id;
+ xfs_extlen_t lsize;
+ __int64_t ffree;
+
+ statp->f_type = XFS_SB_MAGIC;
+ statp->f_namelen = MAXNAMELEN - 1;
+
+ id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
+ statp->f_fsid.val[0] = (u32)id;
+ statp->f_fsid.val[1] = (u32)(id >> 32);
+
+ xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+
+ spin_lock(&mp->m_sb_lock);
+ statp->f_bsize = sbp->sb_blocksize;
+ lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
+ statp->f_blocks = sbp->sb_dblocks - lsize;
+ statp->f_bfree = statp->f_bavail =
+ sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ fakeinos = statp->f_bfree << sbp->sb_inopblog;
+ statp->f_files =
+ MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+ if (mp->m_maxicount)
+ statp->f_files = min_t(typeof(statp->f_files),
+ statp->f_files,
+ mp->m_maxicount);
+
+ /* make sure statp->f_ffree does not underflow */
+ ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+ statp->f_ffree = max_t(__int64_t, ffree, 0);
+
+ spin_unlock(&mp->m_sb_lock);
+
+ if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
+ ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
+ (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+ xfs_qm_statvfs(ip, statp);
+ return 0;
+}
+
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+ __uint64_t resblks = 0;
+
+ mp->m_resblks_save = mp->m_resblks;
+ xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+ __uint64_t resblks;
+
+ if (mp->m_resblks_save) {
+ resblks = mp->m_resblks_save;
+ mp->m_resblks_save = 0;
+ } else
+ resblks = xfs_default_resblks(mp);
+
+ xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC int
+xfs_fs_remount(
+ struct super_block *sb,
+ int *flags,
+ char *options)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
+ int error;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_barrier:
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+ break;
+ case Opt_nobarrier:
+ mp->m_flags &= ~XFS_MOUNT_BARRIER;
+ break;
+ default:
+ /*
+ * Logically we would return an error here to prevent
+ * users from believing they might have changed
+ * mount options using remount which can't be changed.
+ *
+ * But unfortunately mount(8) adds all options from
+ * mtab and fstab to the mount arguments in some cases
+ * so we can't blindly reject options, but have to
+ * check for each specified option if it actually
+ * differs from the currently set option and only
+ * reject it if that's the case.
+ *
+ * Until that is implemented we return success for
+ * every remount request, and silently ignore all
+ * options that we can't actually change.
+ */
+#if 0
+ xfs_info(mp,
+ "mount option \"%s\" not supported for remount\n", p);
+ return -EINVAL;
+#else
+ break;
+#endif
+ }
+ }
+
+ /* ro -> rw */
+ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
+
+ /*
+ * If this is the first remount to writeable state we
+ * might have some superblock changes to update.
+ */
+ if (mp->m_update_flags) {
+ error = xfs_mount_log_sb(mp, mp->m_update_flags);
+ if (error) {
+ xfs_warn(mp, "failed to write sb changes");
+ return error;
+ }
+ mp->m_update_flags = 0;
+ }
+
+ /*
+ * Fill out the reserve pool if it is empty. Use the stashed
+ * value if it is non-zero, otherwise go with the default.
+ */
+ xfs_restore_resvblks(mp);
+ }
+
+ /* rw -> ro */
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+ /*
+ * After we have synced the data but before we sync the
+ * metadata, we need to free up the reserve block pool so that
+ * the used block count in the superblock on disk is correct at
+ * the end of the remount. Stash the current reserve pool size
+ * so that if we get remounted rw, we can return it to the same
+ * size.
+ */
+
+ xfs_quiesce_data(mp);
+ xfs_save_resvblks(mp);
+ xfs_quiesce_attr(mp);
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ }
+
+ return 0;
+}
+
+/*
+ * Second stage of a freeze. The data is already frozen so we only
+ * need to take care of the metadata. Once that's done write a dummy
+ * record to dirty the log in case of a crash while frozen.
+ */
+STATIC int
+xfs_fs_freeze(
+ struct super_block *sb)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ xfs_save_resvblks(mp);
+ xfs_quiesce_attr(mp);
+ return -xfs_fs_log_dummy(mp);
+}
+
+STATIC int
+xfs_fs_unfreeze(
+ struct super_block *sb)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ xfs_restore_resvblks(mp);
+ return 0;
+}
+
+STATIC int
+xfs_fs_show_options(
+ struct seq_file *m,
+ struct vfsmount *mnt)
+{
+ return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock _has_ now been read in.
+ */
+STATIC int
+xfs_finish_flags(
+ struct xfs_mount *mp)
+{
+ int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+ /* Fail a mount where the logbuf is smaller than the log stripe */
+ if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+ if (mp->m_logbsize <= 0 &&
+ mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
+ mp->m_logbsize = mp->m_sb.sb_logsunit;
+ } else if (mp->m_logbsize > 0 &&
+ mp->m_logbsize < mp->m_sb.sb_logsunit) {
+ xfs_warn(mp,
+ "logbuf size must be greater than or equal to log stripe size");
+ return XFS_ERROR(EINVAL);
+ }
+ } else {
+ /* Fail a mount if the logbuf is larger than 32K */
+ if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
+ xfs_warn(mp,
+ "logbuf size for version 1 logs must be 16K or 32K");
+ return XFS_ERROR(EINVAL);
+ }
+ }
+
+ /*
+ * mkfs'ed attr2 will turn on attr2 mount unless explicitly
+ * told by noattr2 to turn it off
+ */
+ if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+ !(mp->m_flags & XFS_MOUNT_NOATTR2))
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+
+ /*
+ * prohibit r/w mounts of read-only filesystems
+ */
+ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
+ xfs_warn(mp,
+ "cannot mount a read-only filesystem as read-write");
+ return XFS_ERROR(EROFS);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_fs_fill_super(
+ struct super_block *sb,
+ void *data,
+ int silent)
+{
+ struct inode *root;
+ struct xfs_mount *mp = NULL;
+ int flags = 0, error = ENOMEM;
+
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
+ if (!mp)
+ goto out;
+
+ spin_lock_init(&mp->m_sb_lock);
+ mutex_init(&mp->m_growlock);
+ atomic_set(&mp->m_active_trans, 0);
+
+ mp->m_super = sb;
+ sb->s_fs_info = mp;
+
+ error = xfs_parseargs(mp, (char *)data);
+ if (error)
+ goto out_free_fsname;
+
+ sb_min_blocksize(sb, BBSIZE);
+ sb->s_xattr = xfs_xattr_handlers;
+ sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
+ sb->s_qcop = &xfs_quotactl_operations;
+#endif
+ sb->s_op = &xfs_super_operations;
+
+ if (silent)
+ flags |= XFS_MFSI_QUIET;
+
+ error = xfs_open_devices(mp);
+ if (error)
+ goto out_free_fsname;
+
+ error = xfs_icsb_init_counters(mp);
+ if (error)
+ goto out_close_devices;
+
+ error = xfs_readsb(mp, flags);
+ if (error)
+ goto out_destroy_counters;
+
+ error = xfs_finish_flags(mp);
+ if (error)
+ goto out_free_sb;
+
+ error = xfs_setup_devices(mp);
+ if (error)
+ goto out_free_sb;
+
+ error = xfs_filestream_mount(mp);
+ if (error)
+ goto out_free_sb;
+
+ /*
+ * we must configure the block size in the superblock before we run the
+ * full mount process as the mount process can lookup and cache inodes.
+ * For the same reason we must also initialise the syncd and register
+ * the inode cache shrinker so that inodes can be reclaimed during
+ * operations like a quotacheck that iterate all inodes in the
+ * filesystem.
+ */
+ sb->s_magic = XFS_SB_MAGIC;
+ sb->s_blocksize = mp->m_sb.sb_blocksize;
+ sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
+ sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+ sb->s_time_gran = 1;
+ set_posix_acl_flag(sb);
+
+ xfs_inode_shrinker_register(mp);
+
+ error = xfs_mountfs(mp);
+ if (error)
+ goto out_filestream_unmount;
+
+ error = xfs_syncd_init(mp);
+ if (error)
+ goto out_unmount;
+
+ root = igrab(VFS_I(mp->m_rootip));
+ if (!root) {
+ error = ENOENT;
+ goto out_syncd_stop;
+ }
+ if (is_bad_inode(root)) {
+ error = EINVAL;
+ goto out_syncd_stop;
+ }
+ sb->s_root = d_alloc_root(root);
+ if (!sb->s_root) {
+ error = ENOMEM;
+ goto out_iput;
+ }
+
+ return 0;
+
+ out_filestream_unmount:
+ xfs_inode_shrinker_unregister(mp);
+ xfs_filestream_unmount(mp);
+ out_free_sb:
+ xfs_freesb(mp);
+ out_destroy_counters:
+ xfs_icsb_destroy_counters(mp);
+ out_close_devices:
+ xfs_close_devices(mp);
+ out_free_fsname:
+ xfs_free_fsname(mp);
+ kfree(mp);
+ out:
+ return -error;
+
+ out_iput:
+ iput(root);
+ out_syncd_stop:
+ xfs_syncd_stop(mp);
+ out_unmount:
+ xfs_inode_shrinker_unregister(mp);
+
+ /*
+ * Blow away any referenced inode in the filestreams cache.
+ * This can and will cause log traffic as inodes go inactive
+ * here.
+ */
+ xfs_filestream_unmount(mp);
+
+ XFS_bflush(mp->m_ddev_targp);
+
+ xfs_unmountfs(mp);
+ goto out_free_sb;
+}
+
+STATIC struct dentry *
+xfs_fs_mount(
+ struct file_system_type *fs_type,
+ int flags,
+ const char *dev_name,
+ void *data)
+{
+ return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+}
+
+static const struct super_operations xfs_super_operations = {
+ .alloc_inode = xfs_fs_alloc_inode,
+ .destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
+ .write_inode = xfs_fs_write_inode,
+ .evict_inode = xfs_fs_evict_inode,
+ .put_super = xfs_fs_put_super,
+ .sync_fs = xfs_fs_sync_fs,
+ .freeze_fs = xfs_fs_freeze,
+ .unfreeze_fs = xfs_fs_unfreeze,
+ .statfs = xfs_fs_statfs,
+ .remount_fs = xfs_fs_remount,
+ .show_options = xfs_fs_show_options,
+};
+
+static struct file_system_type xfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "xfs",
+ .mount = xfs_fs_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+STATIC int __init
+xfs_init_zones(void)
+{
+
+ xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+ if (!xfs_ioend_zone)
+ goto out;
+
+ xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
+ xfs_ioend_zone);
+ if (!xfs_ioend_pool)
+ goto out_destroy_ioend_zone;
+
+ xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+ "xfs_log_ticket");
+ if (!xfs_log_ticket_zone)
+ goto out_destroy_ioend_pool;
+
+ xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
+ "xfs_bmap_free_item");
+ if (!xfs_bmap_free_item_zone)
+ goto out_destroy_log_ticket_zone;
+
+ xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
+ "xfs_btree_cur");
+ if (!xfs_btree_cur_zone)
+ goto out_destroy_bmap_free_item_zone;
+
+ xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+ "xfs_da_state");
+ if (!xfs_da_state_zone)
+ goto out_destroy_btree_cur_zone;
+
+ xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
+ if (!xfs_dabuf_zone)
+ goto out_destroy_da_state_zone;
+
+ xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+ if (!xfs_ifork_zone)
+ goto out_destroy_dabuf_zone;
+
+ xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+ if (!xfs_trans_zone)
+ goto out_destroy_ifork_zone;
+
+ xfs_log_item_desc_zone =
+ kmem_zone_init(sizeof(struct xfs_log_item_desc),
+ "xfs_log_item_desc");
+ if (!xfs_log_item_desc_zone)
+ goto out_destroy_trans_zone;
+
+ /*
+ * The size of the zone allocated buf log item is the maximum
+ * size possible under XFS. This wastes a little bit of memory,
+ * but it is much faster.
+ */
+ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
+ (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
+ NBWORD) * sizeof(int))), "xfs_buf_item");
+ if (!xfs_buf_item_zone)
+ goto out_destroy_log_item_desc_zone;
+
+ xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+ ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
+ sizeof(xfs_extent_t))), "xfs_efd_item");
+ if (!xfs_efd_zone)
+ goto out_destroy_buf_item_zone;
+
+ xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+ ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
+ sizeof(xfs_extent_t))), "xfs_efi_item");
+ if (!xfs_efi_zone)
+ goto out_destroy_efd_zone;
+
+ xfs_inode_zone =
+ kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
+ xfs_fs_inode_init_once);
+ if (!xfs_inode_zone)
+ goto out_destroy_efi_zone;
+
+ xfs_ili_zone =
+ kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
+ KM_ZONE_SPREAD, NULL);
+ if (!xfs_ili_zone)
+ goto out_destroy_inode_zone;
+
+ return 0;
+
+ out_destroy_inode_zone:
+ kmem_zone_destroy(xfs_inode_zone);
+ out_destroy_efi_zone:
+ kmem_zone_destroy(xfs_efi_zone);
+ out_destroy_efd_zone:
+ kmem_zone_destroy(xfs_efd_zone);
+ out_destroy_buf_item_zone:
+ kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_log_item_desc_zone:
+ kmem_zone_destroy(xfs_log_item_desc_zone);
+ out_destroy_trans_zone:
+ kmem_zone_destroy(xfs_trans_zone);
+ out_destroy_ifork_zone:
+ kmem_zone_destroy(xfs_ifork_zone);
+ out_destroy_dabuf_zone:
+ kmem_zone_destroy(xfs_dabuf_zone);
+ out_destroy_da_state_zone:
+ kmem_zone_destroy(xfs_da_state_zone);
+ out_destroy_btree_cur_zone:
+ kmem_zone_destroy(xfs_btree_cur_zone);
+ out_destroy_bmap_free_item_zone:
+ kmem_zone_destroy(xfs_bmap_free_item_zone);
+ out_destroy_log_ticket_zone:
+ kmem_zone_destroy(xfs_log_ticket_zone);
+ out_destroy_ioend_pool:
+ mempool_destroy(xfs_ioend_pool);
+ out_destroy_ioend_zone:
+ kmem_zone_destroy(xfs_ioend_zone);
+ out:
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_zones(void)
+{
+ kmem_zone_destroy(xfs_ili_zone);
+ kmem_zone_destroy(xfs_inode_zone);
+ kmem_zone_destroy(xfs_efi_zone);
+ kmem_zone_destroy(xfs_efd_zone);
+ kmem_zone_destroy(xfs_buf_item_zone);
+ kmem_zone_destroy(xfs_log_item_desc_zone);
+ kmem_zone_destroy(xfs_trans_zone);
+ kmem_zone_destroy(xfs_ifork_zone);
+ kmem_zone_destroy(xfs_dabuf_zone);
+ kmem_zone_destroy(xfs_da_state_zone);
+ kmem_zone_destroy(xfs_btree_cur_zone);
+ kmem_zone_destroy(xfs_bmap_free_item_zone);
+ kmem_zone_destroy(xfs_log_ticket_zone);
+ mempool_destroy(xfs_ioend_pool);
+ kmem_zone_destroy(xfs_ioend_zone);
+
+}
+
+STATIC int __init
+xfs_init_workqueues(void)
+{
+ /*
+ * max_active is set to 8 to give enough concurency to allow
+ * multiple work operations on each CPU to run. This allows multiple
+ * filesystems to be running sync work concurrently, and scales with
+ * the number of CPUs in the system.
+ */
+ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
+ if (!xfs_syncd_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+STATIC void
+xfs_destroy_workqueues(void)
+{
+ destroy_workqueue(xfs_syncd_wq);
+}
+
+STATIC int __init
+init_xfs_fs(void)
+{
+ int error;
+
+ printk(KERN_INFO XFS_VERSION_STRING " with "
+ XFS_BUILD_OPTIONS " enabled\n");
+
+ xfs_ioend_init();
+ xfs_dir_startup();
+
+ error = xfs_init_zones();
+ if (error)
+ goto out;
+
+ error = xfs_init_workqueues();
+ if (error)
+ goto out_destroy_zones;
+
+ error = xfs_mru_cache_init();
+ if (error)
+ goto out_destroy_wq;
+
+ error = xfs_filestream_init();
+ if (error)
+ goto out_mru_cache_uninit;
+
+ error = xfs_buf_init();
+ if (error)
+ goto out_filestream_uninit;
+
+ error = xfs_init_procfs();
+ if (error)
+ goto out_buf_terminate;
+
+ error = xfs_sysctl_register();
+ if (error)
+ goto out_cleanup_procfs;
+
+ vfs_initquota();
+
+ error = register_filesystem(&xfs_fs_type);
+ if (error)
+ goto out_sysctl_unregister;
+ return 0;
+
+ out_sysctl_unregister:
+ xfs_sysctl_unregister();
+ out_cleanup_procfs:
+ xfs_cleanup_procfs();
+ out_buf_terminate:
+ xfs_buf_terminate();
+ out_filestream_uninit:
+ xfs_filestream_uninit();
+ out_mru_cache_uninit:
+ xfs_mru_cache_uninit();
+ out_destroy_wq:
+ xfs_destroy_workqueues();
+ out_destroy_zones:
+ xfs_destroy_zones();
+ out:
+ return error;
+}
+
+STATIC void __exit
+exit_xfs_fs(void)
+{
+ vfs_exitquota();
+ unregister_filesystem(&xfs_fs_type);
+ xfs_sysctl_unregister();
+ xfs_cleanup_procfs();
+ xfs_buf_terminate();
+ xfs_filestream_uninit();
+ xfs_mru_cache_uninit();
+ xfs_destroy_workqueues();
+ xfs_destroy_zones();
+}
+
+module_init(init_xfs_fs);
+module_exit(exit_xfs_fs);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
+MODULE_LICENSE("GPL");
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
new file mode 100644
index 00000000..50a3266c
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SUPER_H__
+#define __XFS_SUPER_H__
+
+#include <linux/exportfs.h>
+
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_qm_init(void);
+extern void xfs_qm_exit(void);
+# define vfs_initquota() xfs_qm_init()
+# define vfs_exitquota() xfs_qm_exit()
+#else
+# define vfs_initquota() do { } while (0)
+# define vfs_exitquota() do { } while (0)
+#endif
+
+#ifdef CONFIG_XFS_POSIX_ACL
+# define XFS_ACL_STRING "ACLs, "
+# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL)
+#else
+# define XFS_ACL_STRING
+# define set_posix_acl_flag(sb) do { } while (0)
+#endif
+
+#define XFS_SECURITY_STRING "security attributes, "
+
+#ifdef CONFIG_XFS_RT
+# define XFS_REALTIME_STRING "realtime, "
+#else
+# define XFS_REALTIME_STRING
+#endif
+
+#if XFS_BIG_BLKNOS
+# if XFS_BIG_INUMS
+# define XFS_BIGFS_STRING "large block/inode numbers, "
+# else
+# define XFS_BIGFS_STRING "large block numbers, "
+# endif
+#else
+# define XFS_BIGFS_STRING
+#endif
+
+#ifdef DEBUG
+# define XFS_DBG_STRING "debug"
+#else
+# define XFS_DBG_STRING "no debug"
+#endif
+
+#define XFS_VERSION_STRING "SGI XFS"
+#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
+ XFS_SECURITY_STRING \
+ XFS_REALTIME_STRING \
+ XFS_BIGFS_STRING \
+ XFS_DBG_STRING /* DBG must be last */
+
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_buftarg;
+struct block_device;
+
+extern __uint64_t xfs_max_file_offset(unsigned int);
+
+extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
+
+extern const struct export_operations xfs_export_operations;
+extern const struct xattr_handler *xfs_xattr_handlers[];
+extern const struct quotactl_ops xfs_quotactl_operations;
+
+#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
+
+#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644
index 00000000..2f277a04
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -0,0 +1,1132 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH 32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(rcu_read_lock_held());
+
+ /*
+ * check for stale RCU freed inode
+ *
+ * If the inode has been reallocated, it doesn't matter if it's not in
+ * the AG we are walking - we are walking for writeback, so if it
+ * passes all the "valid inode" checks and is dirty, then we'll write
+ * it back anyway. If it has been reallocated and still being
+ * initialised, the XFS_INEW check below will catch it.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino)
+ goto out_unlock_noent;
+
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ goto out_unlock_noent;
+ spin_unlock(&ip->i_flags_lock);
+
+ /* nothing to sync during shutdown */
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return EFSCORRUPTED;
+
+ /* If we can't grab the inode, it must on it's way to reclaim. */
+ if (!igrab(inode))
+ return ENOENT;
+
+ if (is_bad_inode(inode)) {
+ IRELE(ip);
+ return ENOENT;
+ }
+
+ /* inode is valid */
+ return 0;
+
+out_unlock_noent:
+ spin_unlock(&ip->i_flags_lock);
+ return ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ int (*execute)(struct xfs_inode *ip,
+ struct xfs_perag *pag, int flags),
+ int flags)
+{
+ uint32_t first_index;
+ int last_error = 0;
+ int skipped;
+ int done;
+ int nr_found;
+
+restart:
+ done = 0;
+ skipped = 0;
+ first_index = 0;
+ nr_found = 0;
+ do {
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ int error = 0;
+ int i;
+
+ rcu_read_lock();
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void **)batch, first_index,
+ XFS_LOOKUP_BATCH);
+ if (!nr_found) {
+ rcu_read_unlock();
+ break;
+ }
+
+ /*
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
+ */
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || xfs_inode_ag_walk_grab(ip))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can occur if
+ * we have inodes in the last block of the AG and we
+ * are currently pointing to the last inode.
+ *
+ * Because we may see inodes that are from the wrong AG
+ * due to RCU freeing and reallocation, only update the
+ * index if it lies in this AG. It was a race that lead
+ * us to see this inode, so another lookup from the
+ * same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = execute(batch[i], pag, flags);
+ IRELE(batch[i]);
+ if (error == EAGAIN) {
+ skipped++;
+ continue;
+ }
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+ }
+
+ /* bail out if the filesystem is corrupted. */
+ if (error == EFSCORRUPTED)
+ break;
+
+ } while (nr_found && !done);
+
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
+ return last_error;
+}
+
+int
+xfs_inode_ag_iterator(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip,
+ struct xfs_perag *pag, int flags),
+ int flags)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+
+ ag = 0;
+ while ((pag = xfs_perag_get(mp, ag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_ag_walk(mp, pag, execute, flags);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == EFSCORRUPTED)
+ break;
+ }
+ }
+ return XFS_ERROR(last_error);
+}
+
+STATIC int
+xfs_sync_inode_data(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags)
+{
+ struct inode *inode = VFS_I(ip);
+ struct address_space *mapping = inode->i_mapping;
+ int error = 0;
+
+ if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ goto out_wait;
+
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+ if (flags & SYNC_TRYLOCK)
+ goto out_wait;
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
+
+ error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
+ 0 : XBF_ASYNC, FI_NONE);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ out_wait:
+ if (flags & SYNC_WAIT)
+ xfs_ioend_wait(ip);
+ return error;
+}
+
+STATIC int
+xfs_sync_inode_attr(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags)
+{
+ int error = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_inode_clean(ip))
+ goto out_unlock;
+ if (!xfs_iflock_nowait(ip)) {
+ if (!(flags & SYNC_WAIT))
+ goto out_unlock;
+ xfs_iflock(ip);
+ }
+
+ if (xfs_inode_clean(ip)) {
+ xfs_ifunlock(ip);
+ goto out_unlock;
+ }
+
+ error = xfs_iflush(ip, flags);
+
+ /*
+ * We don't want to try again on non-blocking flushes that can't run
+ * again immediately. If an inode really must be written, then that's
+ * what the SYNC_WAIT flag is for.
+ */
+ if (error == EAGAIN) {
+ ASSERT(!(flags & SYNC_WAIT));
+ error = 0;
+ }
+
+ out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/*
+ * Write out pagecache data for the whole filesystem.
+ */
+STATIC int
+xfs_sync_data(
+ struct xfs_mount *mp,
+ int flags)
+{
+ int error;
+
+ ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
+
+ error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
+ if (error)
+ return XFS_ERROR(error);
+
+ xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
+ return 0;
+}
+
+/*
+ * Write out inode metadata (attributes) for the whole filesystem.
+ */
+STATIC int
+xfs_sync_attr(
+ struct xfs_mount *mp,
+ int flags)
+{
+ ASSERT((flags & ~SYNC_WAIT) == 0);
+
+ return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
+}
+
+STATIC int
+xfs_sync_fsdata(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *bp;
+
+ /*
+ * If the buffer is pinned then push on the log so we won't get stuck
+ * waiting in the write for someone, maybe ourselves, to flush the log.
+ *
+ * Even though we just pushed the log above, we did not have the
+ * superblock buffer locked at that point so it can become pinned in
+ * between there and here.
+ */
+ bp = xfs_getsb(mp, 0);
+ if (XFS_BUF_ISPINNED(bp))
+ xfs_log_force(mp, 0);
+
+ return xfs_bwrite(mp, bp);
+}
+
+int
+xfs_log_dirty_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ if (!ip->i_update_core)
+ return 0;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * When remounting a filesystem read-only or freezing the filesystem, we have
+ * two phases to execute. This first phase is syncing the data before we
+ * quiesce the filesystem, and the second is flushing all the inodes out after
+ * we've waited for all the transactions created by the first phase to
+ * complete. The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ */
+/*
+ * First stage of freeze - no writers will make progress now we are here,
+ * so we flush delwri and delalloc buffers here, then wait for all I/O to
+ * complete. Data is frozen at that point. Metadata is not frozen,
+ * transactions can still occur here so don't bother flushing the buftarg
+ * because it'll just get dirty again.
+ */
+int
+xfs_quiesce_data(
+ struct xfs_mount *mp)
+{
+ int error, error2 = 0;
+
+ /* push non-blocking */
+ xfs_sync_data(mp, 0);
+ xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+ /* push and block till complete */
+ xfs_sync_data(mp, SYNC_WAIT);
+
+ /*
+ * Log all pending size and timestamp updates. The vfs writeback
+ * code is supposed to do this, but due to its overagressive
+ * livelock detection it will skip inodes where appending writes
+ * were written out in the first non-blocking sync phase if their
+ * completion took long enough that it happened after taking the
+ * timestamp for the cut-off in the blocking phase.
+ */
+ xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
+
+ xfs_qm_sync(mp, SYNC_WAIT);
+
+ /* write superblock and hoover up shutdown errors */
+ error = xfs_sync_fsdata(mp);
+
+ /* make sure all delwri buffers are written out */
+ xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+ /* mark the log as covered if needed */
+ if (xfs_log_need_covered(mp))
+ error2 = xfs_fs_log_dummy(mp);
+
+ /* flush data-only devices */
+ if (mp->m_rtdev_targp)
+ XFS_bflush(mp->m_rtdev_targp);
+
+ return error ? error : error2;
+}
+
+STATIC void
+xfs_quiesce_fs(
+ struct xfs_mount *mp)
+{
+ int count = 0, pincount;
+
+ xfs_reclaim_inodes(mp, 0);
+ xfs_flush_buftarg(mp->m_ddev_targp, 0);
+
+ /*
+ * This loop must run at least twice. The first instance of the loop
+ * will flush most meta data but that will generate more meta data
+ * (typically directory updates). Which then must be flushed and
+ * logged before we can write the unmount record. We also so sync
+ * reclaim of inodes to catch any that the above delwri flush skipped.
+ */
+ do {
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_sync_attr(mp, SYNC_WAIT);
+ pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+ if (!pincount) {
+ delay(50);
+ count++;
+ }
+ } while (count < 2);
+}
+
+/*
+ * Second stage of a quiesce. The data is already synced, now we have to take
+ * care of the metadata. New transactions are already blocked, so we need to
+ * wait for any remaining transactions to drain out before proceeding.
+ */
+void
+xfs_quiesce_attr(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /* wait for all modifications to complete */
+ while (atomic_read(&mp->m_active_trans) > 0)
+ delay(100);
+
+ /* flush inodes and push all remaining buffers out to disk */
+ xfs_quiesce_fs(mp);
+
+ /*
+ * Just warn here till VFS can correctly support
+ * read-only remount without racing.
+ */
+ WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+ /* Push the superblock and write an unmount record */
+ error = xfs_log_sbcount(mp, 1);
+ if (error)
+ xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+ "Frozen image may not be consistent.");
+ xfs_log_unmount_write(mp);
+ xfs_unmountfs_writesb(mp);
+}
+
+static void
+xfs_syncd_queue_sync(
+ struct xfs_mount *mp)
+{
+ queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+ msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items, reclaim inodes and sync
+ * disk quotas. We might need to cover the log to indicate that the
+ * filesystem is idle and not frozen.
+ */
+STATIC void
+xfs_sync_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_sync_work);
+ int error;
+
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ /* dgc: errors ignored here */
+ if (mp->m_super->s_frozen == SB_UNFROZEN &&
+ xfs_log_need_covered(mp))
+ error = xfs_fs_log_dummy(mp);
+ else
+ xfs_log_force(mp, 0);
+ error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+ /* start pushing all the metadata that is currently dirty */
+ xfs_ail_push_all(mp->m_ail);
+ }
+
+ /* queue us up again */
+ xfs_syncd_queue_sync(mp);
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+ struct xfs_mount *mp)
+{
+
+ /*
+ * We can have inodes enter reclaim after we've shut down the syncd
+ * workqueue during unmount, so don't allow reclaim work to be queued
+ * during unmount.
+ */
+ if (!(mp->m_super->s_flags & MS_ACTIVE))
+ return;
+
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+ msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+
+ xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+ xfs_syncd_queue_reclaim(mp);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations. At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room.
+ *
+ * Queue a new data flush if there isn't one already in progress and
+ * wait for completion of the flush. This means that we only ever have one
+ * inode flush in progress no matter how many ENOSPC events are occurring and
+ * so will prevent the system from bogging down due to every concurrent
+ * ENOSPC event scanning all the active inodes in the system for writeback.
+ */
+void
+xfs_flush_inodes(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ queue_work(xfs_syncd_wq, &mp->m_flush_work);
+ flush_work_sync(&mp->m_flush_work);
+}
+
+STATIC void
+xfs_flush_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(work,
+ struct xfs_mount, m_flush_work);
+
+ xfs_sync_data(mp, SYNC_TRYLOCK);
+ xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
+}
+
+int
+xfs_syncd_init(
+ struct xfs_mount *mp)
+{
+ INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+ INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+ xfs_syncd_queue_sync(mp);
+ xfs_syncd_queue_reclaim(mp);
+
+ return 0;
+}
+
+void
+xfs_syncd_stop(
+ struct xfs_mount *mp)
+{
+ cancel_delayed_work_sync(&mp->m_sync_work);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ cancel_work_sync(&mp->m_flush_work);
+}
+
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+
+ if (!pag->pag_ici_reclaimable) {
+ /* propagate the reclaim tag up into the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+
+ /* schedule periodic background inode reclaim */
+ xfs_syncd_queue_reclaim(ip->i_mount);
+
+ trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+ pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+ xfs_perag_t *pag,
+ xfs_inode_t *ip)
+{
+ pag->pag_ici_reclaimable--;
+ if (!pag->pag_ici_reclaimable) {
+ /* clear the reclaim tag from the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+ xfs_mount_t *mp,
+ xfs_perag_t *pag,
+ xfs_inode_t *ip)
+{
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+ struct xfs_inode *ip,
+ int flags)
+{
+ ASSERT(rcu_read_lock_held());
+
+ /* quick check for stale RCU freed inode */
+ if (!ip->i_ino)
+ return 1;
+
+ /*
+ * do some unlocked checks first to avoid unnecessary lock traffic.
+ * The first is a flush lock check, the second is a already in reclaim
+ * check. Only do these checks if we are not going to block on locks.
+ */
+ if ((flags & SYNC_TRYLOCK) &&
+ (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+ return 1;
+ }
+
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ *
+ * Due to RCU lookup, we may find inodes that have been freed and only
+ * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
+ * aren't candidates for reclaim at all, so we must check the
+ * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+ __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* not a reclaim candidate. */
+ spin_unlock(&ip->i_flags_lock);
+ return 1;
+ }
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+}
+
+/*
+ * Inodes in different states need to be treated differently, and the return
+ * value of xfs_iflush is not sufficient to get this right. The following table
+ * lists the inode states and the reclaim actions necessary for non-blocking
+ * reclaim:
+ *
+ *
+ * inode state iflush ret required action
+ * --------------- ---------- ---------------
+ * bad - reclaim
+ * shutdown EIO unpin and reclaim
+ * clean, unpinned 0 reclaim
+ * stale, unpinned 0 reclaim
+ * clean, pinned(*) 0 requeue
+ * stale, pinned EAGAIN requeue
+ * dirty, delwri ok 0 requeue
+ * dirty, delwri blocked EAGAIN requeue
+ * dirty, sync flush 0 reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background relaim, we just requeue the inode for the next pass.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ * bad => reclaim
+ * shutdown => unpin and reclaim
+ * pinned, delwri => requeue
+ * pinned, sync => unpin
+ * stale => reclaim
+ * clean => reclaim
+ * dirty, delwri => flush and requeue
+ * dirty, sync => flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int sync_mode)
+{
+ int error;
+
+restart:
+ error = 0;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iflock_nowait(ip)) {
+ if (!(sync_mode & SYNC_WAIT))
+ goto out;
+
+ /*
+ * If we only have a single dirty inode in a cluster there is
+ * a fair chance that the AIL push may have pushed it into
+ * the buffer, but xfsbufd won't touch it until 30 seconds
+ * from now, and thus we will lock up here.
+ *
+ * Promote the inode buffer to the front of the delwri list
+ * and wake up xfsbufd now.
+ */
+ xfs_promote_inode(ip);
+ xfs_iflock(ip);
+ }
+
+ if (is_bad_inode(VFS_I(ip)))
+ goto reclaim;
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_iunpin_wait(ip);
+ goto reclaim;
+ }
+ if (xfs_ipincount(ip)) {
+ if (!(sync_mode & SYNC_WAIT)) {
+ xfs_ifunlock(ip);
+ goto out;
+ }
+ xfs_iunpin_wait(ip);
+ }
+ if (xfs_iflags_test(ip, XFS_ISTALE))
+ goto reclaim;
+ if (xfs_inode_clean(ip))
+ goto reclaim;
+
+ /*
+ * Now we have an inode that needs flushing.
+ *
+ * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+ * reclaim as we can deadlock with inode cluster removal.
+ * xfs_ifree_cluster() can lock the inode buffer before it locks the
+ * ip->i_lock, and we are doing the exact opposite here. As a result,
+ * doing a blocking xfs_itobp() to get the cluster buffer will result
+ * in an ABBA deadlock with xfs_ifree_cluster().
+ *
+ * As xfs_ifree_cluser() must gather all inodes that are active in the
+ * cache to mark them stale, if we hit this case we don't actually want
+ * to do IO here - we want the inode marked stale so we can simply
+ * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+ * just unlock the inode, back off and try again. Hopefully the next
+ * pass through will see the stale flag set on the inode.
+ */
+ error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
+ if (sync_mode & SYNC_WAIT) {
+ if (error == EAGAIN) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* backoff longer than in xfs_ifree_cluster */
+ delay(2);
+ goto restart;
+ }
+ xfs_iflock(ip);
+ goto reclaim;
+ }
+
+ /*
+ * When we have to flush an inode but don't have SYNC_WAIT set, we
+ * flush the inode out using a delwri buffer and wait for the next
+ * call into reclaim to find it in a clean state instead of waiting for
+ * it now. We also don't return errors here - if the error is transient
+ * then the next reclaim pass will flush the inode, and if the error
+ * is permanent then the next sync reclaim will reclaim the inode and
+ * pass on the error.
+ */
+ if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_warn(ip->i_mount,
+ "inode 0x%llx background reclaim flush failed with %d",
+ (long long)ip->i_ino, error);
+ }
+out:
+ xfs_iflags_clear(ip, XFS_IRECLAIM);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /*
+ * We could return EAGAIN here to make reclaim rescan the inode tree in
+ * a short while. However, this just burns CPU time scanning the tree
+ * waiting for IO to complete and xfssyncd never goes back to the idle
+ * state. Instead, return 0 to let the next scheduled background reclaim
+ * attempt to reclaim the inode again.
+ */
+ return 0;
+
+reclaim:
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ XFS_STATS_INC(xs_ig_reclaims);
+ /*
+ * Remove the inode from the per-AG radix tree.
+ *
+ * Because radix_tree_delete won't complain even if the item was never
+ * added to the tree assert that it's been there before to catch
+ * problems with the inode life time early on.
+ */
+ spin_lock(&pag->pag_ici_lock);
+ if (!radix_tree_delete(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+ ASSERT(0);
+ __xfs_inode_clear_reclaim(pag, ip);
+ spin_unlock(&pag->pag_ici_lock);
+
+ /*
+ * Here we do an (almost) spurious inode lock in order to coordinate
+ * with inode cache radix tree lookups. This is because the lookup
+ * can reference the inodes in the cache without taking references.
+ *
+ * We make that OK here by ensuring that we wait until the inode is
+ * unlocked after the lookup before we go ahead and free it. We get
+ * both the ilock and the iolock because the code may need to drop the
+ * ilock one but will still hold the iolock.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_qm_dqdetach(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ xfs_inode_free(ip);
+ return error;
+
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+ struct xfs_mount *mp,
+ int flags,
+ int *nr_to_scan)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+ int trylock = flags & SYNC_TRYLOCK;
+ int skipped;
+
+restart:
+ ag = 0;
+ skipped = 0;
+ while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+ unsigned long first_index = 0;
+ int done = 0;
+ int nr_found = 0;
+
+ ag = pag->pag_agno + 1;
+
+ if (trylock) {
+ if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+ skipped++;
+ xfs_perag_put(pag);
+ continue;
+ }
+ first_index = pag->pag_ici_reclaim_cursor;
+ } else
+ mutex_lock(&pag->pag_ici_reclaim_lock);
+
+ do {
+ struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+ int i;
+
+ rcu_read_lock();
+ nr_found = radix_tree_gang_lookup_tag(
+ &pag->pag_ici_root,
+ (void **)batch, first_index,
+ XFS_LOOKUP_BATCH,
+ XFS_ICI_RECLAIM_TAG);
+ if (!nr_found) {
+ done = 1;
+ rcu_read_unlock();
+ break;
+ }
+
+ /*
+ * Grab the inodes before we drop the lock. if we found
+ * nothing, nr == 0 and the loop will be skipped.
+ */
+ for (i = 0; i < nr_found; i++) {
+ struct xfs_inode *ip = batch[i];
+
+ if (done || xfs_reclaim_inode_grab(ip, flags))
+ batch[i] = NULL;
+
+ /*
+ * Update the index for the next lookup. Catch
+ * overflows into the next AG range which can
+ * occur if we have inodes in the last block of
+ * the AG and we are currently pointing to the
+ * last inode.
+ *
+ * Because we may see inodes that are from the
+ * wrong AG due to RCU freeing and
+ * reallocation, only update the index if it
+ * lies in this AG. It was a race that lead us
+ * to see this inode, so another lookup from
+ * the same index will not find it again.
+ */
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+ pag->pag_agno)
+ continue;
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+ done = 1;
+ }
+
+ /* unlock now we've grabbed the inodes. */
+ rcu_read_unlock();
+
+ for (i = 0; i < nr_found; i++) {
+ if (!batch[i])
+ continue;
+ error = xfs_reclaim_inode(batch[i], pag, flags);
+ if (error && last_error != EFSCORRUPTED)
+ last_error = error;
+ }
+
+ *nr_to_scan -= XFS_LOOKUP_BATCH;
+
+ } while (nr_found && !done && *nr_to_scan > 0);
+
+ if (trylock && !done)
+ pag->pag_ici_reclaim_cursor = first_index;
+ else
+ pag->pag_ici_reclaim_cursor = 0;
+ mutex_unlock(&pag->pag_ici_reclaim_lock);
+ xfs_perag_put(pag);
+ }
+
+ /*
+ * if we skipped any AG, and we still have scan count remaining, do
+ * another pass this time using blocking reclaim semantics (i.e
+ * waiting on the reclaim locks and ignoring the reclaim cursors). This
+ * ensure that when we get more reclaimers than AGs we block rather
+ * than spin trying to execute reclaim.
+ */
+ if (trylock && skipped && *nr_to_scan > 0) {
+ trylock = 0;
+ goto restart;
+ }
+ return XFS_ERROR(last_error);
+}
+
+int
+xfs_reclaim_inodes(
+ xfs_mount_t *mp,
+ int mode)
+{
+ int nr_to_scan = INT_MAX;
+
+ return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doiing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+static int
+xfs_reclaim_inode_shrink(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t ag;
+ int reclaimable;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
+ mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
+ if (nr_to_scan) {
+ /* kick background reclaimer and push the AIL */
+ xfs_syncd_queue_reclaim(mp);
+ xfs_ail_push_all(mp->m_ail);
+
+ if (!(gfp_mask & __GFP_FS))
+ return -1;
+
+ xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+ &nr_to_scan);
+ /* terminate if we don't exhaust the scan */
+ if (nr_to_scan > 0)
+ return -1;
+ }
+
+ reclaimable = 0;
+ ag = 0;
+ while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+ ag = pag->pag_agno + 1;
+ reclaimable += pag->pag_ici_reclaimable;
+ xfs_perag_put(pag);
+ }
+ return reclaimable;
+}
+
+void
+xfs_inode_shrinker_register(
+ struct xfs_mount *mp)
+{
+ mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
+ mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
+ register_shrinker(&mp->m_inode_shrink);
+}
+
+void
+xfs_inode_shrinker_unregister(
+ struct xfs_mount *mp)
+{
+ unregister_shrinker(&mp->m_inode_shrink);
+}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644
index 00000000..ef5b2ce4
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+typedef struct xfs_sync_work {
+ struct list_head w_list;
+ struct xfs_mount *w_mount;
+ void *w_data; /* syncer routine argument */
+ void (*w_syncer)(struct xfs_mount *, void *);
+ struct completion *w_completion;
+} xfs_sync_work_t;
+
+#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
+#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
+
+extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+
+int xfs_syncd_init(struct xfs_mount *mp);
+void xfs_syncd_stop(struct xfs_mount *mp);
+
+int xfs_quiesce_data(struct xfs_mount *mp);
+void xfs_quiesce_attr(struct xfs_mount *mp);
+
+void xfs_flush_inodes(struct xfs_inode *ip);
+
+int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+ struct xfs_inode *ip);
+
+int xfs_sync_inode_grab(struct xfs_inode *ip);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+ int flags);
+
+void xfs_inode_shrinker_register(struct xfs_mount *mp);
+void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
+
+#endif
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
new file mode 100644
index 00000000..ee2d2ada
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include "xfs_error.h"
+
+static struct ctl_table_header *xfs_table_header;
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+xfs_stats_clear_proc_handler(
+ ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int c, ret, *valp = ctl->data;
+ __uint32_t vn_active;
+
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+
+ if (!ret && write && *valp) {
+ xfs_notice(NULL, "Clearing xfsstats");
+ for_each_possible_cpu(c) {
+ preempt_disable();
+ /* save vn_active, it's a universal truth! */
+ vn_active = per_cpu(xfsstats, c).vn_active;
+ memset(&per_cpu(xfsstats, c), 0,
+ sizeof(struct xfsstats));
+ per_cpu(xfsstats, c).vn_active = vn_active;
+ preempt_enable();
+ }
+ xfs_stats_clear = 0;
+ }
+
+ return ret;
+}
+
+STATIC int
+xfs_panic_mask_proc_handler(
+ ctl_table *ctl,
+ int write,
+ void __user *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, *valp = ctl->data;
+
+ ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ xfs_panic_mask = *valp;
+#ifdef DEBUG
+ xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+ }
+ return ret;
+}
+#endif /* CONFIG_PROC_FS */
+
+static ctl_table xfs_table[] = {
+ {
+ .procname = "irix_sgid_inherit",
+ .data = &xfs_params.sgid_inherit.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.sgid_inherit.min,
+ .extra2 = &xfs_params.sgid_inherit.max
+ },
+ {
+ .procname = "irix_symlink_mode",
+ .data = &xfs_params.symlink_mode.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.symlink_mode.min,
+ .extra2 = &xfs_params.symlink_mode.max
+ },
+ {
+ .procname = "panic_mask",
+ .data = &xfs_params.panic_mask.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_panic_mask_proc_handler,
+ .extra1 = &xfs_params.panic_mask.min,
+ .extra2 = &xfs_params.panic_mask.max
+ },
+
+ {
+ .procname = "error_level",
+ .data = &xfs_params.error_level.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.error_level.min,
+ .extra2 = &xfs_params.error_level.max
+ },
+ {
+ .procname = "xfssyncd_centisecs",
+ .data = &xfs_params.syncd_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.syncd_timer.min,
+ .extra2 = &xfs_params.syncd_timer.max
+ },
+ {
+ .procname = "inherit_sync",
+ .data = &xfs_params.inherit_sync.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_sync.min,
+ .extra2 = &xfs_params.inherit_sync.max
+ },
+ {
+ .procname = "inherit_nodump",
+ .data = &xfs_params.inherit_nodump.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_nodump.min,
+ .extra2 = &xfs_params.inherit_nodump.max
+ },
+ {
+ .procname = "inherit_noatime",
+ .data = &xfs_params.inherit_noatim.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_noatim.min,
+ .extra2 = &xfs_params.inherit_noatim.max
+ },
+ {
+ .procname = "xfsbufd_centisecs",
+ .data = &xfs_params.xfs_buf_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.xfs_buf_timer.min,
+ .extra2 = &xfs_params.xfs_buf_timer.max
+ },
+ {
+ .procname = "age_buffer_centisecs",
+ .data = &xfs_params.xfs_buf_age.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.xfs_buf_age.min,
+ .extra2 = &xfs_params.xfs_buf_age.max
+ },
+ {
+ .procname = "inherit_nosymlinks",
+ .data = &xfs_params.inherit_nosym.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_nosym.min,
+ .extra2 = &xfs_params.inherit_nosym.max
+ },
+ {
+ .procname = "rotorstep",
+ .data = &xfs_params.rotorstep.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.rotorstep.min,
+ .extra2 = &xfs_params.rotorstep.max
+ },
+ {
+ .procname = "inherit_nodefrag",
+ .data = &xfs_params.inherit_nodfrg.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.inherit_nodfrg.min,
+ .extra2 = &xfs_params.inherit_nodfrg.max
+ },
+ {
+ .procname = "filestream_centisecs",
+ .data = &xfs_params.fstrm_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.fstrm_timer.min,
+ .extra2 = &xfs_params.fstrm_timer.max,
+ },
+ /* please keep this the last entry */
+#ifdef CONFIG_PROC_FS
+ {
+ .procname = "stats_clear",
+ .data = &xfs_params.stats_clear.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = xfs_stats_clear_proc_handler,
+ .extra1 = &xfs_params.stats_clear.min,
+ .extra2 = &xfs_params.stats_clear.max
+ },
+#endif /* CONFIG_PROC_FS */
+
+ {}
+};
+
+static ctl_table xfs_dir_table[] = {
+ {
+ .procname = "xfs",
+ .mode = 0555,
+ .child = xfs_table
+ },
+ {}
+};
+
+static ctl_table xfs_root_table[] = {
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = xfs_dir_table
+ },
+ {}
+};
+
+int
+xfs_sysctl_register(void)
+{
+ xfs_table_header = register_sysctl_table(xfs_root_table);
+ if (!xfs_table_header)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+xfs_sysctl_unregister(void)
+{
+ unregister_sysctl_table(xfs_table_header);
+}
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
new file mode 100644
index 00000000..b9937d45
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SYSCTL_H__
+#define __XFS_SYSCTL_H__
+
+#include <linux/sysctl.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+typedef struct xfs_sysctl_val {
+ int min;
+ int val;
+ int max;
+} xfs_sysctl_val_t;
+
+typedef struct xfs_param {
+ xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
+ * not a member of parent dir GID. */
+ xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
+ xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
+ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
+ xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */
+ xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */
+ xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */
+ xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
+ xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
+ xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
+ xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */
+ xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
+ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
+ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
+ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
+} xfs_param_t;
+
+/*
+ * xfs_error_level:
+ *
+ * How much error reporting will be done when internal problems are
+ * encountered. These problems normally return an EFSCORRUPTED to their
+ * caller, with no other information reported.
+ *
+ * 0 No error reports
+ * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown
+ * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any
+ * additional errors that are known to not cause shutdowns)
+ *
+ * xfs_panic_mask bit 0x8 turns the error reports into panics
+ */
+
+enum {
+ /* XFS_REFCACHE_SIZE = 1 */
+ /* XFS_REFCACHE_PURGE = 2 */
+ /* XFS_RESTRICT_CHOWN = 3 */
+ XFS_SGID_INHERIT = 4,
+ XFS_SYMLINK_MODE = 5,
+ XFS_PANIC_MASK = 6,
+ XFS_ERRLEVEL = 7,
+ XFS_SYNCD_TIMER = 8,
+ /* XFS_PROBE_DMAPI = 9 */
+ /* XFS_PROBE_IOOPS = 10 */
+ /* XFS_PROBE_QUOTA = 11 */
+ XFS_STATS_CLEAR = 12,
+ XFS_INHERIT_SYNC = 13,
+ XFS_INHERIT_NODUMP = 14,
+ XFS_INHERIT_NOATIME = 15,
+ XFS_BUF_TIMER = 16,
+ XFS_BUF_AGE = 17,
+ /* XFS_IO_BYPASS = 18 */
+ XFS_INHERIT_NOSYM = 19,
+ XFS_ROTORSTEP = 20,
+ XFS_INHERIT_NODFRG = 21,
+ XFS_FILESTREAM_TIMER = 22,
+};
+
+extern xfs_param_t xfs_params;
+
+#ifdef CONFIG_SYSCTL
+extern int xfs_sysctl_register(void);
+extern void xfs_sysctl_unregister(void);
+#else
+# define xfs_sysctl_register() (0)
+# define xfs_sysctl_unregister() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+#endif /* __XFS_SYSCTL_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
new file mode 100644
index 00000000..88d25d4a
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_mount.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_quota.h"
+#include "xfs_iomap.h"
+#include "xfs_aops.h"
+#include "quota/xfs_dquot_item.h"
+#include "quota/xfs_dquot.h"
+#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "xfs_trace.h"
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
new file mode 100644
index 00000000..d48b7a57
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -0,0 +1,1776 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs
+
+#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_H
+
+#include <linux/tracepoint.h>
+
+struct xfs_agf;
+struct xfs_alloc_arg;
+struct xfs_attr_list_context;
+struct xfs_buf_log_item;
+struct xfs_da_args;
+struct xfs_da_node_entry;
+struct xfs_dquot;
+struct xlog_ticket;
+struct log;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
+
+DECLARE_EVENT_CLASS(xfs_attr_list_class,
+ TP_PROTO(struct xfs_attr_list_context *ctx),
+ TP_ARGS(ctx),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u32, hashval)
+ __field(u32, blkno)
+ __field(u32, offset)
+ __field(void *, alist)
+ __field(int, bufsize)
+ __field(int, count)
+ __field(int, firstu)
+ __field(int, dupcnt)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+ __entry->ino = ctx->dp->i_ino;
+ __entry->hashval = ctx->cursor->hashval;
+ __entry->blkno = ctx->cursor->blkno;
+ __entry->offset = ctx->cursor->offset;
+ __entry->alist = ctx->alist;
+ __entry->bufsize = ctx->bufsize;
+ __entry->count = ctx->count;
+ __entry->firstu = ctx->firstu;
+ __entry->flags = ctx->flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+ "alist 0x%p size %u count %u firstu %u flags %d %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->hashval,
+ __entry->blkno,
+ __entry->offset,
+ __entry->dupcnt,
+ __entry->alist,
+ __entry->bufsize,
+ __entry->count,
+ __entry->firstu,
+ __entry->flags,
+ __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
+ )
+)
+
+#define DEFINE_ATTR_LIST_EVENT(name) \
+DEFINE_EVENT(xfs_attr_list_class, name, \
+ TP_PROTO(struct xfs_attr_list_context *ctx), \
+ TP_ARGS(ctx))
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+
+DECLARE_EVENT_CLASS(xfs_perag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, refcount, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, refcount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->refcount = refcount;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->refcount,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name) \
+DEFINE_EVENT(xfs_perag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+
+TRACE_EVENT(xfs_attr_list_node_descend,
+ TP_PROTO(struct xfs_attr_list_context *ctx,
+ struct xfs_da_node_entry *btree),
+ TP_ARGS(ctx, btree),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u32, hashval)
+ __field(u32, blkno)
+ __field(u32, offset)
+ __field(void *, alist)
+ __field(int, bufsize)
+ __field(int, count)
+ __field(int, firstu)
+ __field(int, dupcnt)
+ __field(int, flags)
+ __field(u32, bt_hashval)
+ __field(u32, bt_before)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+ __entry->ino = ctx->dp->i_ino;
+ __entry->hashval = ctx->cursor->hashval;
+ __entry->blkno = ctx->cursor->blkno;
+ __entry->offset = ctx->cursor->offset;
+ __entry->alist = ctx->alist;
+ __entry->bufsize = ctx->bufsize;
+ __entry->count = ctx->count;
+ __entry->firstu = ctx->firstu;
+ __entry->flags = ctx->flags;
+ __entry->bt_hashval = be32_to_cpu(btree->hashval);
+ __entry->bt_before = be32_to_cpu(btree->before);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+ "alist 0x%p size %u count %u firstu %u flags %d %s "
+ "node hashval %u, node before %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->hashval,
+ __entry->blkno,
+ __entry->offset,
+ __entry->dupcnt,
+ __entry->alist,
+ __entry->bufsize,
+ __entry->count,
+ __entry->firstu,
+ __entry->flags,
+ __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+ __entry->bt_hashval,
+ __entry->bt_before)
+);
+
+TRACE_EVENT(xfs_iext_insert,
+ TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
+ struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
+ TP_ARGS(ip, idx, r, state, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_extnum_t, idx)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ __field(int, bmap_state)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->idx = idx;
+ __entry->startoff = r->br_startoff;
+ __entry->startblock = r->br_startblock;
+ __entry->blockcount = r->br_blockcount;
+ __entry->state = r->br_state;
+ __entry->bmap_state = state;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+ "offset %lld block %lld count %lld flag %d caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+ (long)__entry->idx,
+ __entry->startoff,
+ (__int64_t)__entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ (char *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_bmap_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
+ unsigned long caller_ip),
+ TP_ARGS(ip, idx, state, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_extnum_t, idx)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ __field(int, bmap_state)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ?
+ ip->i_afp : &ip->i_df;
+ struct xfs_bmbt_irec r;
+
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->idx = idx;
+ __entry->startoff = r.br_startoff;
+ __entry->startblock = r.br_startblock;
+ __entry->blockcount = r.br_blockcount;
+ __entry->state = r.br_state;
+ __entry->bmap_state = state;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+ "offset %lld block %lld count %lld flag %d caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+ (long)__entry->idx,
+ __entry->startoff,
+ (__int64_t)__entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_BMAP_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
+ unsigned long caller_ip), \
+ TP_ARGS(ip, idx, state, caller_ip))
+DEFINE_BMAP_EVENT(xfs_iext_remove);
+DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
+DEFINE_BMAP_EVENT(xfs_bmap_post_update);
+DEFINE_BMAP_EVENT(xfs_extlist);
+
+DECLARE_EVENT_CLASS(xfs_buf_class,
+ TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
+ TP_ARGS(bp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, bno)
+ __field(size_t, buffer_length)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned, lockval)
+ __field(unsigned, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = bp->b_target->bt_dev;
+ __entry->bno = bp->b_bn;
+ __entry->buffer_length = bp->b_buffer_length;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = xfs_buf_lock_value(bp);
+ __entry->flags = bp->b_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ "lock %d flags %s caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->bno,
+ __entry->buffer_length,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_EVENT(name) \
+DEFINE_EVENT(xfs_buf_class, name, \
+ TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
+ TP_ARGS(bp, caller_ip))
+DEFINE_BUF_EVENT(xfs_buf_init);
+DEFINE_BUF_EVENT(xfs_buf_free);
+DEFINE_BUF_EVENT(xfs_buf_hold);
+DEFINE_BUF_EVENT(xfs_buf_rele);
+DEFINE_BUF_EVENT(xfs_buf_iodone);
+DEFINE_BUF_EVENT(xfs_buf_iorequest);
+DEFINE_BUF_EVENT(xfs_buf_bawrite);
+DEFINE_BUF_EVENT(xfs_buf_bdwrite);
+DEFINE_BUF_EVENT(xfs_buf_lock);
+DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_cond_lock);
+DEFINE_BUF_EVENT(xfs_buf_unlock);
+DEFINE_BUF_EVENT(xfs_buf_iowait);
+DEFINE_BUF_EVENT(xfs_buf_iowait_done);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_split);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
+DEFINE_BUF_EVENT(xfs_bdstrat_shut);
+DEFINE_BUF_EVENT(xfs_buf_item_relse);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+
+/* not really buffer traces, but the buf provides useful information */
+DEFINE_BUF_EVENT(xfs_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_reset_dqcounts);
+DEFINE_BUF_EVENT(xfs_inode_item_push);
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(xfs_buf_flags_class,
+ TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
+ TP_ARGS(bp, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, bno)
+ __field(size_t, buffer_length)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned, lockval)
+ __field(unsigned, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = bp->b_target->bt_dev;
+ __entry->bno = bp->b_bn;
+ __entry->buffer_length = bp->b_buffer_length;
+ __entry->flags = flags;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = xfs_buf_lock_value(bp);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ "lock %d flags %s caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->bno,
+ __entry->buffer_length,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_FLAGS_EVENT(name) \
+DEFINE_EVENT(xfs_buf_flags_class, name, \
+ TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
+ TP_ARGS(bp, flags, caller_ip))
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+
+TRACE_EVENT(xfs_buf_ioerror,
+ TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip),
+ TP_ARGS(bp, error, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, bno)
+ __field(size_t, buffer_length)
+ __field(unsigned, flags)
+ __field(int, hold)
+ __field(int, pincount)
+ __field(unsigned, lockval)
+ __field(int, error)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = bp->b_target->bt_dev;
+ __entry->bno = bp->b_bn;
+ __entry->buffer_length = bp->b_buffer_length;
+ __entry->hold = atomic_read(&bp->b_hold);
+ __entry->pincount = atomic_read(&bp->b_pin_count);
+ __entry->lockval = xfs_buf_lock_value(bp);
+ __entry->error = error;
+ __entry->flags = bp->b_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ "lock %d error %d flags %s caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->bno,
+ __entry->buffer_length,
+ __entry->hold,
+ __entry->pincount,
+ __entry->lockval,
+ __entry->error,
+ __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ (void *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_buf_item_class,
+ TP_PROTO(struct xfs_buf_log_item *bip),
+ TP_ARGS(bip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, buf_bno)
+ __field(size_t, buf_len)
+ __field(int, buf_hold)
+ __field(int, buf_pincount)
+ __field(int, buf_lockval)
+ __field(unsigned, buf_flags)
+ __field(unsigned, bli_recur)
+ __field(int, bli_refcount)
+ __field(unsigned, bli_flags)
+ __field(void *, li_desc)
+ __field(unsigned, li_flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = bip->bli_buf->b_target->bt_dev;
+ __entry->bli_flags = bip->bli_flags;
+ __entry->bli_recur = bip->bli_recur;
+ __entry->bli_refcount = atomic_read(&bip->bli_refcount);
+ __entry->buf_bno = bip->bli_buf->b_bn;
+ __entry->buf_len = bip->bli_buf->b_buffer_length;
+ __entry->buf_flags = bip->bli_buf->b_flags;
+ __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+ __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
+ __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf);
+ __entry->li_desc = bip->bli_item.li_desc;
+ __entry->li_flags = bip->bli_item.li_flags;
+ ),
+ TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+ "lock %d flags %s recur %d refcount %d bliflags %s "
+ "lidesc 0x%p liflags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->buf_bno,
+ __entry->buf_len,
+ __entry->buf_hold,
+ __entry->buf_pincount,
+ __entry->buf_lockval,
+ __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
+ __entry->bli_recur,
+ __entry->bli_refcount,
+ __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
+ __entry->li_desc,
+ __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_BUF_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_buf_item_class, name, \
+ TP_PROTO(struct xfs_buf_log_item *bip), \
+ TP_ARGS(bip))
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+
+DECLARE_EVENT_CLASS(xfs_lock_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
+ unsigned long caller_ip),
+ TP_ARGS(ip, lock_flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, lock_flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->lock_flags = lock_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
+ (void *)__entry->caller_ip)
+)
+
+#define DEFINE_LOCK_EVENT(name) \
+DEFINE_EVENT(xfs_lock_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
+ unsigned long caller_ip), \
+ TP_ARGS(ip, lock_flags, caller_ip))
+DEFINE_LOCK_EVENT(xfs_ilock);
+DEFINE_LOCK_EVENT(xfs_ilock_nowait);
+DEFINE_LOCK_EVENT(xfs_ilock_demote);
+DEFINE_LOCK_EVENT(xfs_iunlock);
+
+DECLARE_EVENT_CLASS(xfs_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino)
+)
+
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_check_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_evict_inode);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
+ TP_ARGS(ip, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, count)
+ __field(int, pincount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->count = atomic_read(&VFS_I(ip)->i_count);
+ __entry->pincount = atomic_read(&ip->i_pincount);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->count,
+ __entry->pincount,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
+ TP_ARGS(ip, caller_ip))
+DEFINE_IREF_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+ TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+ TP_ARGS(dp, name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dp_ino)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(dp)->i_sb->s_dev;
+ __entry->dp_ino = dp->i_ino;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dp ino 0x%llx name %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dp_ino,
+ __get_str(name))
+)
+
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+ TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+ TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+ TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+ struct xfs_name *src_name, struct xfs_name *target_name),
+ TP_ARGS(src_dp, target_dp, src_name, target_name),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, src_dp_ino)
+ __field(xfs_ino_t, target_dp_ino)
+ __dynamic_array(char, src_name, src_name->len)
+ __dynamic_array(char, target_name, target_name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+ __entry->src_dp_ino = src_dp->i_ino;
+ __entry->target_dp_ino = target_dp->i_ino;
+ memcpy(__get_str(src_name), src_name->name, src_name->len);
+ memcpy(__get_str(target_name), target_name->name, target_name->len);
+ ),
+ TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+ " src name %s target name %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->src_dp_ino,
+ __entry->target_dp_ino,
+ __get_str(src_name),
+ __get_str(target_name))
+)
+
+DECLARE_EVENT_CLASS(xfs_dquot_class,
+ TP_PROTO(struct xfs_dquot *dqp),
+ TP_ARGS(dqp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, id)
+ __field(unsigned, flags)
+ __field(unsigned, nrefs)
+ __field(unsigned long long, res_bcount)
+ __field(unsigned long long, bcount)
+ __field(unsigned long long, icount)
+ __field(unsigned long long, blk_hardlimit)
+ __field(unsigned long long, blk_softlimit)
+ __field(unsigned long long, ino_hardlimit)
+ __field(unsigned long long, ino_softlimit)
+ ), \
+ TP_fast_assign(
+ __entry->dev = dqp->q_mount->m_super->s_dev;
+ __entry->id = be32_to_cpu(dqp->q_core.d_id);
+ __entry->flags = dqp->dq_flags;
+ __entry->nrefs = dqp->q_nrefs;
+ __entry->res_bcount = dqp->q_res_bcount;
+ __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
+ __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
+ __entry->blk_hardlimit =
+ be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+ __entry->blk_softlimit =
+ be64_to_cpu(dqp->q_core.d_blk_softlimit);
+ __entry->ino_hardlimit =
+ be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+ __entry->ino_softlimit =
+ be64_to_cpu(dqp->q_core.d_ino_softlimit);
+ ),
+ TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
+ "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+ "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->id,
+ __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
+ __entry->nrefs,
+ __entry->res_bcount,
+ __entry->bcount,
+ __entry->blk_hardlimit,
+ __entry->blk_softlimit,
+ __entry->icount,
+ __entry->ino_hardlimit,
+ __entry->ino_softlimit)
+)
+
+#define DEFINE_DQUOT_EVENT(name) \
+DEFINE_EVENT(xfs_dquot_class, name, \
+ TP_PROTO(struct xfs_dquot *dqp), \
+ TP_ARGS(dqp))
+DEFINE_DQUOT_EVENT(xfs_dqadjust);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqattach_found);
+DEFINE_DQUOT_EVENT(xfs_dqattach_get);
+DEFINE_DQUOT_EVENT(xfs_dqinit);
+DEFINE_DQUOT_EVENT(xfs_dqreuse);
+DEFINE_DQUOT_EVENT(xfs_dqalloc);
+DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
+DEFINE_DQUOT_EVENT(xfs_dqread);
+DEFINE_DQUOT_EVENT(xfs_dqread_fail);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
+DEFINE_DQUOT_EVENT(xfs_dqget_hit);
+DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqput);
+DEFINE_DQUOT_EVENT(xfs_dqput_wait);
+DEFINE_DQUOT_EVENT(xfs_dqput_free);
+DEFINE_DQUOT_EVENT(xfs_dqrele);
+DEFINE_DQUOT_EVENT(xfs_dqflush);
+DEFINE_DQUOT_EVENT(xfs_dqflush_force);
+DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+
+DECLARE_EVENT_CLASS(xfs_loggrant_class,
+ TP_PROTO(struct log *log, struct xlog_ticket *tic),
+ TP_ARGS(log, tic),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned, trans_type)
+ __field(char, ocnt)
+ __field(char, cnt)
+ __field(int, curr_res)
+ __field(int, unit_res)
+ __field(unsigned int, flags)
+ __field(int, reserveq)
+ __field(int, writeq)
+ __field(int, grant_reserve_cycle)
+ __field(int, grant_reserve_bytes)
+ __field(int, grant_write_cycle)
+ __field(int, grant_write_bytes)
+ __field(int, curr_cycle)
+ __field(int, curr_block)
+ __field(xfs_lsn_t, tail_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->trans_type = tic->t_trans_type;
+ __entry->ocnt = tic->t_ocnt;
+ __entry->cnt = tic->t_cnt;
+ __entry->curr_res = tic->t_curr_res;
+ __entry->unit_res = tic->t_unit_res;
+ __entry->flags = tic->t_flags;
+ __entry->reserveq = list_empty(&log->l_reserveq);
+ __entry->writeq = list_empty(&log->l_writeq);
+ xlog_crack_grant_head(&log->l_grant_reserve_head,
+ &__entry->grant_reserve_cycle,
+ &__entry->grant_reserve_bytes);
+ xlog_crack_grant_head(&log->l_grant_write_head,
+ &__entry->grant_write_cycle,
+ &__entry->grant_write_bytes);
+ __entry->curr_cycle = log->l_curr_cycle;
+ __entry->curr_block = log->l_curr_block;
+ __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
+ ),
+ TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+ "t_unit_res %u t_flags %s reserveq %s "
+ "writeq %s grant_reserve_cycle %d "
+ "grant_reserve_bytes %d grant_write_cycle %d "
+ "grant_write_bytes %d curr_cycle %d curr_block %d "
+ "tail_cycle %d tail_block %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
+ __entry->ocnt,
+ __entry->cnt,
+ __entry->curr_res,
+ __entry->unit_res,
+ __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
+ __entry->reserveq ? "empty" : "active",
+ __entry->writeq ? "empty" : "active",
+ __entry->grant_reserve_cycle,
+ __entry->grant_reserve_bytes,
+ __entry->grant_write_cycle,
+ __entry->grant_write_bytes,
+ __entry->curr_cycle,
+ __entry->curr_block,
+ CYCLE_LSN(__entry->tail_lsn),
+ BLOCK_LSN(__entry->tail_lsn)
+ )
+)
+
+#define DEFINE_LOGGRANT_EVENT(name) \
+DEFINE_EVENT(xfs_loggrant_class, name, \
+ TP_PROTO(struct log *log, struct xlog_ticket *tic), \
+ TP_ARGS(log, tic))
+DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
+DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+
+DECLARE_EVENT_CLASS(xfs_file_class,
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+ TP_ARGS(ip, count, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fsize_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count 0x%zx ioflags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count,
+ __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+)
+
+#define DEFINE_RW_EVENT(name) \
+DEFINE_EVENT(xfs_file_class, name, \
+ TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+ TP_ARGS(ip, count, offset, flags))
+DEFINE_RW_EVENT(xfs_file_read);
+DEFINE_RW_EVENT(xfs_file_buffered_write);
+DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_splice_read);
+DEFINE_RW_EVENT(xfs_file_splice_write);
+
+DECLARE_EVENT_CLASS(xfs_page_class,
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+ TP_ARGS(inode, page, off),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(pgoff_t, pgoff)
+ __field(loff_t, size)
+ __field(unsigned long, offset)
+ __field(int, delalloc)
+ __field(int, unwritten)
+ ),
+ TP_fast_assign(
+ int delalloc = -1, unwritten = -1;
+
+ if (page_has_buffers(page))
+ xfs_count_page_state(page, &delalloc, &unwritten);
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = XFS_I(inode)->i_ino;
+ __entry->pgoff = page_offset(page);
+ __entry->size = i_size_read(inode);
+ __entry->offset = off;
+ __entry->delalloc = delalloc;
+ __entry->unwritten = unwritten;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+ "delalloc %d unwritten %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pgoff,
+ __entry->size,
+ __entry->offset,
+ __entry->delalloc,
+ __entry->unwritten)
+)
+
+#define DEFINE_PAGE_EVENT(name) \
+DEFINE_EVENT(xfs_page_class, name, \
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
+ TP_ARGS(inode, page, off))
+DEFINE_PAGE_EVENT(xfs_writepage);
+DEFINE_PAGE_EVENT(xfs_releasepage);
+DEFINE_PAGE_EVENT(xfs_invalidatepage);
+
+DECLARE_EVENT_CLASS(xfs_imap_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+ int type, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, type, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ __field(int, type)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ __entry->type = type;
+ __entry->startoff = irec ? irec->br_startoff : 0;
+ __entry->startblock = irec ? irec->br_startblock : 0;
+ __entry->blockcount = irec ? irec->br_blockcount : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count %zd type %s "
+ "startoff 0x%llx startblock %lld blockcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count,
+ __print_symbolic(__entry->type, XFS_IO_TYPES),
+ __entry->startoff,
+ (__int64_t)__entry->startblock,
+ __entry->blockcount)
+)
+
+#define DEFINE_IOMAP_EVENT(name) \
+DEFINE_EVENT(xfs_imap_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+ int type, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, type, irec))
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_ARGS(ip, offset, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(loff_t, size)
+ __field(loff_t, new_size)
+ __field(loff_t, offset)
+ __field(size_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = ip->i_new_size;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ "offset 0x%llx count %zd",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size,
+ __entry->offset,
+ __entry->count)
+);
+
+#define DEFINE_SIMPLE_IO_EVENT(name) \
+DEFINE_EVENT(xfs_simple_io_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+ TP_ARGS(ip, offset, count))
+DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
+DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
+
+
+TRACE_EVENT(xfs_itruncate_start,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size, int flag,
+ xfs_off_t toss_start, xfs_off_t toss_finish),
+ TP_ARGS(ip, new_size, flag, toss_start, toss_finish),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fsize_t, new_size)
+ __field(xfs_off_t, toss_start)
+ __field(xfs_off_t, toss_finish)
+ __field(int, flag)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = new_size;
+ __entry->toss_start = toss_start;
+ __entry->toss_finish = toss_finish;
+ __entry->flag = flag;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx %s size 0x%llx new_size 0x%llx "
+ "toss start 0x%llx toss finish 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->flag, "|", XFS_ITRUNC_FLAGS),
+ __entry->size,
+ __entry->new_size,
+ __entry->toss_start,
+ __entry->toss_finish)
+);
+
+DECLARE_EVENT_CLASS(xfs_itrunc_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
+ TP_ARGS(ip, new_size),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fsize_t, new_size)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->new_size = new_size;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->new_size)
+)
+
+#define DEFINE_ITRUNC_EVENT(name) \
+DEFINE_EVENT(xfs_itrunc_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
+ TP_ARGS(ip, new_size))
+DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end);
+
+TRACE_EVENT(xfs_pagecache_inval,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
+ TP_ARGS(ip, start, finish),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_off_t, start)
+ __field(xfs_off_t, finish)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->start = start;
+ __entry->finish = finish;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->start,
+ __entry->finish)
+);
+
+TRACE_EVENT(xfs_bunmap,
+ TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
+ int flags, unsigned long caller_ip),
+ TP_ARGS(ip, bno, len, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_fileoff_t, bno)
+ __field(xfs_filblks_t, len)
+ __field(unsigned long, caller_ip)
+ __field(int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->size = ip->i_d.di_size;
+ __entry->bno = bno;
+ __entry->len = len;
+ __entry->caller_ip = caller_ip;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
+ "flags %s caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->bno,
+ __entry->len,
+ __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
+ (void *)__entry->caller_ip)
+
+);
+
+DECLARE_EVENT_CLASS(xfs_busy_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_busy_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_alloc_busy);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
+
+TRACE_EVENT(xfs_alloc_busy_trim,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ xfs_agblock_t tbno, xfs_extlen_t tlen),
+ TP_ARGS(mp, agno, agbno, len, tbno, tlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_agblock_t, tbno)
+ __field(xfs_extlen_t, tlen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->tbno = tbno;
+ __entry->tlen = tlen;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->tbno,
+ __entry->tlen)
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+ TP_PROTO(struct xfs_trans *trans),
+ TP_ARGS(trans),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(struct xfs_trans *, tp)
+ __field(xfs_lsn_t, lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = trans->t_mountp->m_super->s_dev;
+ __entry->tp = trans;
+ __entry->lsn = trans->t_commit_lsn;
+ ),
+ TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tp,
+ __entry->lsn)
+);
+
+TRACE_EVENT(xfs_agf,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agf, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, flags)
+ __field(__u32, length)
+ __field(__u32, bno_root)
+ __field(__u32, cnt_root)
+ __field(__u32, bno_level)
+ __field(__u32, cnt_level)
+ __field(__u32, flfirst)
+ __field(__u32, fllast)
+ __field(__u32, flcount)
+ __field(__u32, freeblks)
+ __field(__u32, longest)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = be32_to_cpu(agf->agf_seqno),
+ __entry->flags = flags;
+ __entry->length = be32_to_cpu(agf->agf_length),
+ __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
+ __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
+ __entry->bno_level =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
+ __entry->cnt_level =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+ __entry->flfirst = be32_to_cpu(agf->agf_flfirst),
+ __entry->fllast = be32_to_cpu(agf->agf_fllast),
+ __entry->flcount = be32_to_cpu(agf->agf_flcount),
+ __entry->freeblks = be32_to_cpu(agf->agf_freeblks),
+ __entry->longest = be32_to_cpu(agf->agf_longest);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
+ "levels b %u c %u flfirst %u fllast %u flcount %u "
+ "freeblks %u longest %u caller %pf",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
+ __entry->length,
+ __entry->bno_root,
+ __entry->cnt_root,
+ __entry->bno_level,
+ __entry->cnt_level,
+ __entry->flfirst,
+ __entry->fllast,
+ __entry->flcount,
+ __entry->freeblks,
+ __entry->longest,
+ (void *)__entry->caller_ip)
+);
+
+TRACE_EVENT(xfs_free_extent,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_extlen_t len, bool isfl, int haveleft, int haveright),
+ TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(int, isfl)
+ __field(int, haveleft)
+ __field(int, haveright)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->isfl = isfl;
+ __entry->haveleft = haveleft;
+ __entry->haveright = haveright;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->isfl,
+ __entry->haveleft ?
+ (__entry->haveright ? "both" : "left") :
+ (__entry->haveright ? "right" : "none"))
+
+);
+
+DECLARE_EVENT_CLASS(xfs_alloc_class,
+ TP_PROTO(struct xfs_alloc_arg *args),
+ TP_ARGS(args),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, minlen)
+ __field(xfs_extlen_t, maxlen)
+ __field(xfs_extlen_t, mod)
+ __field(xfs_extlen_t, prod)
+ __field(xfs_extlen_t, minleft)
+ __field(xfs_extlen_t, total)
+ __field(xfs_extlen_t, alignment)
+ __field(xfs_extlen_t, minalignslop)
+ __field(xfs_extlen_t, len)
+ __field(short, type)
+ __field(short, otype)
+ __field(char, wasdel)
+ __field(char, wasfromfl)
+ __field(char, isfl)
+ __field(char, userdata)
+ __field(xfs_fsblock_t, firstblock)
+ ),
+ TP_fast_assign(
+ __entry->dev = args->mp->m_super->s_dev;
+ __entry->agno = args->agno;
+ __entry->agbno = args->agbno;
+ __entry->minlen = args->minlen;
+ __entry->maxlen = args->maxlen;
+ __entry->mod = args->mod;
+ __entry->prod = args->prod;
+ __entry->minleft = args->minleft;
+ __entry->total = args->total;
+ __entry->alignment = args->alignment;
+ __entry->minalignslop = args->minalignslop;
+ __entry->len = args->len;
+ __entry->type = args->type;
+ __entry->otype = args->otype;
+ __entry->wasdel = args->wasdel;
+ __entry->wasfromfl = args->wasfromfl;
+ __entry->isfl = args->isfl;
+ __entry->userdata = args->userdata;
+ __entry->firstblock = args->firstblock;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
+ "prod %u minleft %u total %u alignment %u minalignslop %u "
+ "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
+ "userdata %d firstblock 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->minlen,
+ __entry->maxlen,
+ __entry->mod,
+ __entry->prod,
+ __entry->minleft,
+ __entry->total,
+ __entry->alignment,
+ __entry->minalignslop,
+ __entry->len,
+ __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
+ __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
+ __entry->wasdel,
+ __entry->wasfromfl,
+ __entry->isfl,
+ __entry->userdata,
+ (unsigned long long)__entry->firstblock)
+)
+
+#define DEFINE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_alloc_class, name, \
+ TP_PROTO(struct xfs_alloc_arg *args), \
+ TP_ARGS(args))
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+
+DECLARE_EVENT_CLASS(xfs_dir2_class,
+ TP_PROTO(struct xfs_da_args *args),
+ TP_ARGS(args),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __dynamic_array(char, name, args->namelen)
+ __field(int, namelen)
+ __field(xfs_dahash_t, hashval)
+ __field(xfs_ino_t, inumber)
+ __field(int, op_flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ if (args->namelen)
+ memcpy(__get_str(name), args->name, args->namelen);
+ __entry->namelen = args->namelen;
+ __entry->hashval = args->hashval;
+ __entry->inumber = args->inumber;
+ __entry->op_flags = args->op_flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
+ "inumber 0x%llx op_flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->namelen,
+ __entry->namelen ? __get_str(name) : NULL,
+ __entry->namelen,
+ __entry->hashval,
+ __entry->inumber,
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
+#define DEFINE_DIR2_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_class, name, \
+ TP_PROTO(struct xfs_da_args *args), \
+ TP_ARGS(args))
+DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
+DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+
+DECLARE_EVENT_CLASS(xfs_dir2_space_class,
+ TP_PROTO(struct xfs_da_args *args, int idx),
+ TP_ARGS(args, idx),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, op_flags)
+ __field(int, idx)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ __entry->op_flags = args->op_flags;
+ __entry->idx = idx;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+ __entry->idx)
+)
+
+#define DEFINE_DIR2_SPACE_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_space_class, name, \
+ TP_PROTO(struct xfs_da_args *args, int idx), \
+ TP_ARGS(args, idx))
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
+
+TRACE_EVENT(xfs_dir2_leafn_moveents,
+ TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
+ TP_ARGS(args, src_idx, dst_idx, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, op_flags)
+ __field(int, src_idx)
+ __field(int, dst_idx)
+ __field(int, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ __entry->op_flags = args->op_flags;
+ __entry->src_idx = src_idx;
+ __entry->dst_idx = dst_idx;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx op_flags %s "
+ "src_idx %d dst_idx %d count %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+ __entry->src_idx,
+ __entry->dst_idx,
+ __entry->count)
+);
+
+#define XFS_SWAPEXT_INODES \
+ { 0, "target" }, \
+ { 1, "temp" }
+
+#define XFS_INODE_FORMAT_STR \
+ { 0, "invalid" }, \
+ { 1, "local" }, \
+ { 2, "extent" }, \
+ { 3, "btree" }
+
+DECLARE_EVENT_CLASS(xfs_swap_extent_class,
+ TP_PROTO(struct xfs_inode *ip, int which),
+ TP_ARGS(ip, which),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, which)
+ __field(xfs_ino_t, ino)
+ __field(int, format)
+ __field(int, nex)
+ __field(int, max_nex)
+ __field(int, broot_size)
+ __field(int, fork_off)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->which = which;
+ __entry->ino = ip->i_ino;
+ __entry->format = ip->i_d.di_format;
+ __entry->nex = ip->i_d.di_nextents;
+ __entry->max_nex = ip->i_df.if_ext_max;
+ __entry->broot_size = ip->i_df.if_broot_bytes;
+ __entry->fork_off = XFS_IFORK_BOFF(ip);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+ "Max in-fork extents %d, broot size %d, fork offset %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
+ __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+ __entry->nex,
+ __entry->max_nex,
+ __entry->broot_size,
+ __entry->fork_off)
+)
+
+#define DEFINE_SWAPEXT_EVENT(name) \
+DEFINE_EVENT(xfs_swap_extent_class, name, \
+ TP_PROTO(struct xfs_inode *ip, int which), \
+ TP_ARGS(ip, which))
+
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+ TP_PROTO(struct log *log, struct xlog_recover *trans,
+ struct xlog_recover_item *item, int pass),
+ TP_ARGS(log, trans, item, pass),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, item)
+ __field(xlog_tid_t, tid)
+ __field(int, type)
+ __field(int, pass)
+ __field(int, count)
+ __field(int, total)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->item = (unsigned long)item;
+ __entry->tid = trans->r_log_tid;
+ __entry->type = ITEM_TYPE(item);
+ __entry->pass = pass;
+ __entry->count = item->ri_cnt;
+ __entry->total = item->ri_total;
+ ),
+ TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
+ "item region count/total %d/%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tid,
+ __entry->pass,
+ (void *)__entry->item,
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __entry->count,
+ __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+ TP_PROTO(struct log *log, struct xlog_recover *trans, \
+ struct xlog_recover_item *item, int pass), \
+ TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+ TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+ TP_ARGS(log, buf_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(__int64_t, blkno)
+ __field(unsigned short, len)
+ __field(unsigned short, flags)
+ __field(unsigned short, size)
+ __field(unsigned int, map_size)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->blkno = buf_f->blf_blkno;
+ __entry->len = buf_f->blf_len;
+ __entry->flags = buf_f->blf_flags;
+ __entry->size = buf_f->blf_size;
+ __entry->map_size = buf_f->blf_map_size;
+ ),
+ TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+ "map_size %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->blkno,
+ __entry->len,
+ __entry->flags,
+ __entry->size,
+ __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+ TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+ TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+ TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+ TP_ARGS(log, in_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned short, size)
+ __field(int, fields)
+ __field(unsigned short, asize)
+ __field(unsigned short, dsize)
+ __field(__int64_t, blkno)
+ __field(int, len)
+ __field(int, boffset)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->ino = in_f->ilf_ino;
+ __entry->size = in_f->ilf_size;
+ __entry->fields = in_f->ilf_fields;
+ __entry->asize = in_f->ilf_asize;
+ __entry->dsize = in_f->ilf_dsize;
+ __entry->blkno = in_f->ilf_blkno;
+ __entry->len = in_f->ilf_len;
+ __entry->boffset = in_f->ilf_boffset;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+ "dsize %d, blkno 0x%llx, len %d, boffset %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->fields,
+ __entry->asize,
+ __entry->dsize,
+ __entry->blkno,
+ __entry->len,
+ __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+ TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+ TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
+DECLARE_EVENT_CLASS(xfs_discard_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
+#endif /* _TRACE_XFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE xfs_trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
new file mode 100644
index 00000000..7c220b42
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_VNODE_H__
+#define __XFS_VNODE_H__
+
+#include "xfs_fs.h"
+
+struct file;
+struct xfs_inode;
+struct xfs_iomap;
+struct attrlist_cursor_kern;
+
+/*
+ * Return values for xfs_inactive. A return value of
+ * VN_INACTIVE_NOCACHE implies that the file system behavior
+ * has disassociated its state and bhv_desc_t from the vnode.
+ */
+#define VN_INACTIVE_CACHE 0
+#define VN_INACTIVE_NOCACHE 1
+
+/*
+ * Flags for read/write calls - same values as IRIX
+ */
+#define IO_ISDIRECT 0x00004 /* bypass page cache */
+#define IO_INVIS 0x00020 /* don't update inode timestamps */
+
+#define XFS_IO_FLAGS \
+ { IO_ISDIRECT, "DIRECT" }, \
+ { IO_INVIS, "INVIS"}
+
+/*
+ * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
+ */
+#define FI_NONE 0 /* none */
+#define FI_REMAPF 1 /* Do a remapf prior to the operation */
+#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation.
+ Prevent VM access to the pages until
+ the operation completes. */
+
+/*
+ * Some useful predicates.
+ */
+#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
+#define VN_CACHED(vp) (vp->i_mapping->nrpages)
+#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \
+ PAGECACHE_TAG_DIRTY)
+
+
+#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
new file mode 100644
index 00000000..87d3e038
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2008 Christoph Hellwig.
+ * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_acl.h"
+#include "xfs_vnodeops.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+static int
+xfs_xattr_get(struct dentry *dentry, const char *name,
+ void *value, size_t size, int xflags)
+{
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ int error, asize = size;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ /* Convert Linux syscall to XFS internal ATTR flags */
+ if (!size) {
+ xflags |= ATTR_KERNOVAL;
+ value = NULL;
+ }
+
+ error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+ if (error)
+ return error;
+ return asize;
+}
+
+static int
+xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags, int xflags)
+{
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ /* Convert Linux syscall to XFS internal ATTR flags */
+ if (flags & XATTR_CREATE)
+ xflags |= ATTR_CREATE;
+ if (flags & XATTR_REPLACE)
+ xflags |= ATTR_REPLACE;
+
+ if (!value)
+ return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
+ return -xfs_attr_set(ip, (unsigned char *)name,
+ (void *)value, size, xflags);
+}
+
+static const struct xattr_handler xfs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .flags = 0, /* no flags implies user namespace */
+ .get = xfs_xattr_get,
+ .set = xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .flags = ATTR_ROOT,
+ .get = xfs_xattr_get,
+ .set = xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .flags = ATTR_SECURE,
+ .get = xfs_xattr_get,
+ .set = xfs_xattr_set,
+};
+
+const struct xattr_handler *xfs_xattr_handlers[] = {
+ &xfs_xattr_user_handler,
+ &xfs_xattr_trusted_handler,
+ &xfs_xattr_security_handler,
+#ifdef CONFIG_XFS_POSIX_ACL
+ &xfs_xattr_acl_access_handler,
+ &xfs_xattr_acl_default_handler,
+#endif
+ NULL
+};
+
+static unsigned int xfs_xattr_prefix_len(int flags)
+{
+ if (flags & XFS_ATTR_SECURE)
+ return sizeof("security");
+ else if (flags & XFS_ATTR_ROOT)
+ return sizeof("trusted");
+ else
+ return sizeof("user");
+}
+
+static const char *xfs_xattr_prefix(int flags)
+{
+ if (flags & XFS_ATTR_SECURE)
+ return xfs_xattr_security_handler.prefix;
+ else if (flags & XFS_ATTR_ROOT)
+ return xfs_xattr_trusted_handler.prefix;
+ else
+ return xfs_xattr_user_handler.prefix;
+}
+
+static int
+xfs_xattr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen,
+ unsigned char *value)
+{
+ unsigned int prefix_len = xfs_xattr_prefix_len(flags);
+ char *offset;
+ int arraytop;
+
+ ASSERT(context->count >= 0);
+
+ /*
+ * Only show root namespace entries if we are actually allowed to
+ * see them.
+ */
+ if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
+ return 0;
+
+ arraytop = context->count + prefix_len + namelen + 1;
+ if (arraytop > context->firstu) {
+ context->count = -1; /* insufficient space */
+ return 1;
+ }
+ offset = (char *)context->alist + context->count;
+ strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+ offset += prefix_len;
+ strncpy(offset, (char *)name, namelen); /* real name */
+ offset += namelen;
+ *offset = '\0';
+ context->count += prefix_len + namelen + 1;
+ return 0;
+}
+
+static int
+xfs_xattr_put_listent_sizes(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen,
+ unsigned char *value)
+{
+ context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
+ return 0;
+}
+
+static int
+list_one_attr(const char *name, const size_t len, void *data,
+ size_t size, ssize_t *result)
+{
+ char *p = data + *result;
+
+ *result += len;
+ if (!size)
+ return 0;
+ if (*result > size)
+ return -ERANGE;
+
+ strcpy(p, name);
+ return 0;
+}
+
+ssize_t
+xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+ struct xfs_attr_list_context context;
+ struct attrlist_cursor_kern cursor = { 0 };
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ /*
+ * First read the regular on-disk attributes.
+ */
+ memset(&context, 0, sizeof(context));
+ context.dp = XFS_I(inode);
+ context.cursor = &cursor;
+ context.resynch = 1;
+ context.alist = data;
+ context.bufsize = size;
+ context.firstu = context.bufsize;
+
+ if (size)
+ context.put_listent = xfs_xattr_put_listent;
+ else
+ context.put_listent = xfs_xattr_put_listent_sizes;
+
+ xfs_attr_list_int(&context);
+ if (context.count < 0)
+ return -ERANGE;
+
+ /*
+ * Then add the two synthetic ACL attributes.
+ */
+ if (posix_acl_access_exists(inode)) {
+ error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
+ strlen(POSIX_ACL_XATTR_ACCESS) + 1,
+ data, size, &context.count);
+ if (error)
+ return error;
+ }
+
+ if (posix_acl_default_exists(inode)) {
+ error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
+ strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
+ data, size, &context.count);
+ if (error)
+ return error;
+ }
+
+ return context.count;
+}
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
new file mode 100644
index 00000000..6fa21460
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -0,0 +1,1496 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+
+
+/*
+ LOCK ORDER
+
+ inode lock (ilock)
+ dquot hash-chain lock (hashlock)
+ xqm dquot freelist lock (freelistlock
+ mount's dquot list lock (mplistlock)
+ user dquot lock - lock ordering among dquots is based on the uid or gid
+ group dquot lock - similar to udquots. Between the two dquots, the udquot
+ has to be locked first.
+ pin lock - the dquot lock must be held to take this lock.
+ flush lock - ditto.
+*/
+
+#ifdef DEBUG
+xfs_buftarg_t *xfs_dqerror_target;
+int xfs_do_dqerror;
+int xfs_dqreq_num;
+int xfs_dqerror_mod = 33;
+#endif
+
+static struct lock_class_key xfs_dquot_other_class;
+
+/*
+ * Allocate and initialize a dquot. We don't always allocate fresh memory;
+ * we try to reclaim a free dquot if the number of incore dquots are above
+ * a threshold.
+ * The only field inside the core that gets initialized at this point
+ * is the d_id field. The idea is to fill in the entire q_core
+ * when we read in the on disk dquot.
+ */
+STATIC xfs_dquot_t *
+xfs_qm_dqinit(
+ xfs_mount_t *mp,
+ xfs_dqid_t id,
+ uint type)
+{
+ xfs_dquot_t *dqp;
+ boolean_t brandnewdquot;
+
+ brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
+ dqp->dq_flags = type;
+ dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_mount = mp;
+
+ /*
+ * No need to re-initialize these if this is a reclaimed dquot.
+ */
+ if (brandnewdquot) {
+ INIT_LIST_HEAD(&dqp->q_freelist);
+ mutex_init(&dqp->q_qlock);
+ init_waitqueue_head(&dqp->q_pinwait);
+
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
+ trace_xfs_dqinit(dqp);
+ } else {
+ /*
+ * Only the q_core portion was zeroed in dqreclaim_one().
+ * So, we need to reset others.
+ */
+ dqp->q_nrefs = 0;
+ dqp->q_blkno = 0;
+ INIT_LIST_HEAD(&dqp->q_mplist);
+ INIT_LIST_HEAD(&dqp->q_hashlist);
+ dqp->q_bufoffset = 0;
+ dqp->q_fileoffset = 0;
+ dqp->q_transp = NULL;
+ dqp->q_gdquot = NULL;
+ dqp->q_res_bcount = 0;
+ dqp->q_res_icount = 0;
+ dqp->q_res_rtbcount = 0;
+ atomic_set(&dqp->q_pincount, 0);
+ dqp->q_hash = NULL;
+ ASSERT(list_empty(&dqp->q_freelist));
+
+ trace_xfs_dqreuse(dqp);
+ }
+
+ /*
+ * In either case we need to make sure group quotas have a different
+ * lock class than user quotas, to make sure lockdep knows we can
+ * locks of one of each at the same time.
+ */
+ if (!(type & XFS_DQ_USER))
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+
+ /*
+ * log item gets initialized later
+ */
+ return (dqp);
+}
+
+/*
+ * This is called to free all the memory associated with a dquot
+ */
+void
+xfs_qm_dqdestroy(
+ xfs_dquot_t *dqp)
+{
+ ASSERT(list_empty(&dqp->q_freelist));
+
+ mutex_destroy(&dqp->q_qlock);
+ kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+
+ atomic_dec(&xfs_Gqm->qm_totaldquots);
+}
+
+/*
+ * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
+ */
+STATIC void
+xfs_qm_dqinit_core(
+ xfs_dqid_t id,
+ uint type,
+ xfs_dqblk_t *d)
+{
+ /*
+ * Caller has zero'd the entire dquot 'chunk' already.
+ */
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_id = cpu_to_be32(id);
+ d->dd_diskdq.d_flags = type;
+}
+
+/*
+ * If default limits are in force, push them into the dquot now.
+ * We overwrite the dquot limits only if they are zero and this
+ * is not the root dquot.
+ */
+void
+xfs_qm_adjust_dqlimits(
+ xfs_mount_t *mp,
+ xfs_disk_dquot_t *d)
+{
+ xfs_quotainfo_t *q = mp->m_quotainfo;
+
+ ASSERT(d->d_id);
+
+ if (q->qi_bsoftlimit && !d->d_blk_softlimit)
+ d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
+ if (q->qi_bhardlimit && !d->d_blk_hardlimit)
+ d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+ if (q->qi_isoftlimit && !d->d_ino_softlimit)
+ d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
+ if (q->qi_ihardlimit && !d->d_ino_hardlimit)
+ d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
+ if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
+ d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
+ if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
+ d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+}
+
+/*
+ * Check the limits and timers of a dquot and start or reset timers
+ * if necessary.
+ * This gets called even when quota enforcement is OFF, which makes our
+ * life a little less complicated. (We just don't reject any quota
+ * reservations in that case, when enforcement is off).
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
+ * enforcement's off.
+ * In contrast, warnings are a little different in that they don't
+ * 'automatically' get started when limits get exceeded. They do
+ * get reset to zero, however, when we find the count to be under
+ * the soft limit (they are only ever set non-zero via userspace).
+ */
+void
+xfs_qm_adjust_dqtimers(
+ xfs_mount_t *mp,
+ xfs_disk_dquot_t *d)
+{
+ ASSERT(d->d_id);
+
+#ifdef QUOTADEBUG
+ if (d->d_blk_hardlimit)
+ ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
+ be64_to_cpu(d->d_blk_hardlimit));
+ if (d->d_ino_hardlimit)
+ ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
+ be64_to_cpu(d->d_ino_hardlimit));
+ if (d->d_rtb_hardlimit)
+ ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
+ be64_to_cpu(d->d_rtb_hardlimit));
+#endif
+ if (!d->d_btimer) {
+ if ((d->d_blk_softlimit &&
+ (be64_to_cpu(d->d_bcount) >=
+ be64_to_cpu(d->d_blk_softlimit))) ||
+ (d->d_blk_hardlimit &&
+ (be64_to_cpu(d->d_bcount) >=
+ be64_to_cpu(d->d_blk_hardlimit)))) {
+ d->d_btimer = cpu_to_be32(get_seconds() +
+ mp->m_quotainfo->qi_btimelimit);
+ } else {
+ d->d_bwarns = 0;
+ }
+ } else {
+ if ((!d->d_blk_softlimit ||
+ (be64_to_cpu(d->d_bcount) <
+ be64_to_cpu(d->d_blk_softlimit))) &&
+ (!d->d_blk_hardlimit ||
+ (be64_to_cpu(d->d_bcount) <
+ be64_to_cpu(d->d_blk_hardlimit)))) {
+ d->d_btimer = 0;
+ }
+ }
+
+ if (!d->d_itimer) {
+ if ((d->d_ino_softlimit &&
+ (be64_to_cpu(d->d_icount) >=
+ be64_to_cpu(d->d_ino_softlimit))) ||
+ (d->d_ino_hardlimit &&
+ (be64_to_cpu(d->d_icount) >=
+ be64_to_cpu(d->d_ino_hardlimit)))) {
+ d->d_itimer = cpu_to_be32(get_seconds() +
+ mp->m_quotainfo->qi_itimelimit);
+ } else {
+ d->d_iwarns = 0;
+ }
+ } else {
+ if ((!d->d_ino_softlimit ||
+ (be64_to_cpu(d->d_icount) <
+ be64_to_cpu(d->d_ino_softlimit))) &&
+ (!d->d_ino_hardlimit ||
+ (be64_to_cpu(d->d_icount) <
+ be64_to_cpu(d->d_ino_hardlimit)))) {
+ d->d_itimer = 0;
+ }
+ }
+
+ if (!d->d_rtbtimer) {
+ if ((d->d_rtb_softlimit &&
+ (be64_to_cpu(d->d_rtbcount) >=
+ be64_to_cpu(d->d_rtb_softlimit))) ||
+ (d->d_rtb_hardlimit &&
+ (be64_to_cpu(d->d_rtbcount) >=
+ be64_to_cpu(d->d_rtb_hardlimit)))) {
+ d->d_rtbtimer = cpu_to_be32(get_seconds() +
+ mp->m_quotainfo->qi_rtbtimelimit);
+ } else {
+ d->d_rtbwarns = 0;
+ }
+ } else {
+ if ((!d->d_rtb_softlimit ||
+ (be64_to_cpu(d->d_rtbcount) <
+ be64_to_cpu(d->d_rtb_softlimit))) &&
+ (!d->d_rtb_hardlimit ||
+ (be64_to_cpu(d->d_rtbcount) <
+ be64_to_cpu(d->d_rtb_hardlimit)))) {
+ d->d_rtbtimer = 0;
+ }
+ }
+}
+
+/*
+ * initialize a buffer full of dquots and log the whole thing
+ */
+STATIC void
+xfs_qm_init_dquot_blk(
+ xfs_trans_t *tp,
+ xfs_mount_t *mp,
+ xfs_dqid_t id,
+ uint type,
+ xfs_buf_t *bp)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ xfs_dqblk_t *d;
+ int curid, i;
+
+ ASSERT(tp);
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+ d = (xfs_dqblk_t *)XFS_BUF_PTR(bp);
+
+ /*
+ * ID of the first dquot in the block - id's are zero based.
+ */
+ curid = id - (id % q->qi_dqperchunk);
+ ASSERT(curid >= 0);
+ memset(d, 0, BBTOB(q->qi_dqchunklen));
+ for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
+ xfs_qm_dqinit_core(curid, type, d);
+ xfs_trans_dquot_buf(tp, bp,
+ (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+ ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+ XFS_BLF_GDQUOT_BUF)));
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+}
+
+
+
+/*
+ * Allocate a block and fill it with dquots.
+ * This is called when the bmapi finds a hole.
+ */
+STATIC int
+xfs_qm_dqalloc(
+ xfs_trans_t **tpp,
+ xfs_mount_t *mp,
+ xfs_dquot_t *dqp,
+ xfs_inode_t *quotip,
+ xfs_fileoff_t offset_fsb,
+ xfs_buf_t **O_bpp)
+{
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+ xfs_bmbt_irec_t map;
+ int nmaps, error, committed;
+ xfs_buf_t *bp;
+ xfs_trans_t *tp = *tpp;
+
+ ASSERT(tp != NULL);
+
+ trace_xfs_dqalloc(dqp);
+
+ /*
+ * Initialize the bmap freelist prior to calling bmapi code.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ xfs_ilock(quotip, XFS_ILOCK_EXCL);
+ /*
+ * Return if this type of quotas is turned off while we didn't
+ * have an inode lock
+ */
+ if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+ return (ESRCH);
+ }
+
+ xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
+ nmaps = 1;
+ if ((error = xfs_bmapi(tp, quotip,
+ offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
+ XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
+ &firstblock,
+ XFS_QM_DQALLOC_SPACE_RES(mp),
+ &map, &nmaps, &flist))) {
+ goto error0;
+ }
+ ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
+ ASSERT(nmaps == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ /*
+ * Keep track of the blkno to save a lookup later
+ */
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+ /* now we can just get the buffer (there's nothing to read yet) */
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen,
+ 0);
+ if (!bp || (error = XFS_BUF_GETERROR(bp)))
+ goto error1;
+ /*
+ * Make a chunk of dquots out of this buffer and log
+ * the entire thing.
+ */
+ xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
+ dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+
+ /*
+ * xfs_bmap_finish() may commit the current transaction and
+ * start a second transaction if the freelist is not empty.
+ *
+ * Since we still want to modify this buffer, we need to
+ * ensure that the buffer is not released on commit of
+ * the first transaction and ensure the buffer is added to the
+ * second transaction.
+ *
+ * If there is only one transaction then don't stop the buffer
+ * from being released when it commits later on.
+ */
+
+ xfs_trans_bhold(tp, bp);
+
+ if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+ goto error1;
+ }
+
+ if (committed) {
+ tp = *tpp;
+ xfs_trans_bjoin(tp, bp);
+ } else {
+ xfs_trans_bhold_release(tp, bp);
+ }
+
+ *O_bpp = bp;
+ return 0;
+
+ error1:
+ xfs_bmap_cancel(&flist);
+ error0:
+ xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+
+ return (error);
+}
+
+/*
+ * Maps a dquot to the buffer containing its on-disk version.
+ * This returns a ptr to the buffer containing the on-disk dquot
+ * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ */
+STATIC int
+xfs_qm_dqtobp(
+ xfs_trans_t **tpp,
+ xfs_dquot_t *dqp,
+ xfs_disk_dquot_t **O_ddpp,
+ xfs_buf_t **O_bpp,
+ uint flags)
+{
+ xfs_bmbt_irec_t map;
+ int nmaps = 1, error;
+ xfs_buf_t *bp;
+ xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
+ xfs_mount_t *mp = dqp->q_mount;
+ xfs_disk_dquot_t *ddq;
+ xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
+ xfs_trans_t *tp = (tpp ? *tpp : NULL);
+
+ dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+
+ xfs_ilock(quotip, XFS_ILOCK_SHARED);
+ if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+ /*
+ * Return if this type of quotas is turned off while we
+ * didn't have the quota inode lock.
+ */
+ xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+ return ESRCH;
+ }
+
+ /*
+ * Find the block map; no allocations yet
+ */
+ error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+ NULL, 0, &map, &nmaps, NULL);
+
+ xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+ if (error)
+ return error;
+
+ ASSERT(nmaps == 1);
+ ASSERT(map.br_blockcount == 1);
+
+ /*
+ * Offset of dquot in the (fixed sized) dquot chunk.
+ */
+ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+ sizeof(xfs_dqblk_t);
+
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ if (map.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * We don't allocate unless we're asked to
+ */
+ if (!(flags & XFS_QMOPT_DQALLOC))
+ return ENOENT;
+
+ ASSERT(tp);
+ error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
+ dqp->q_fileoffset, &bp);
+ if (error)
+ return error;
+ tp = *tpp;
+ } else {
+ trace_xfs_dqtobp_read(dqp);
+
+ /*
+ * store the blkno etc so that we don't have to do the
+ * mapping all the time
+ */
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen,
+ 0, &bp);
+ if (error || !bp)
+ return XFS_ERROR(error);
+ }
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+ /*
+ * calculate the location of the dquot inside the buffer.
+ */
+ ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+
+ /*
+ * A simple sanity check in case we got a corrupted dquot...
+ */
+ error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
+ flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
+ "dqtobp");
+ if (error) {
+ if (!(flags & XFS_QMOPT_DQREPAIR)) {
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EIO);
+ }
+ XFS_BUF_BUSY(bp); /* We dirtied this */
+ }
+
+ *O_bpp = bp;
+ *O_ddpp = ddq;
+
+ return (0);
+}
+
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately.
+ *
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqread(
+ xfs_trans_t **tpp,
+ xfs_dqid_t id,
+ xfs_dquot_t *dqp, /* dquot to get filled in */
+ uint flags)
+{
+ xfs_disk_dquot_t *ddqp;
+ xfs_buf_t *bp;
+ int error;
+ xfs_trans_t *tp;
+
+ ASSERT(tpp);
+
+ trace_xfs_dqread(dqp);
+
+ /*
+ * get a pointer to the on-disk dquot and the buffer containing it
+ * dqp already knows its own type (GROUP/USER).
+ */
+ if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
+ return (error);
+ }
+ tp = *tpp;
+
+ /* copy everything from disk dquot to the incore dquot */
+ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+ ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
+ xfs_qm_dquot_logitem_init(dqp);
+
+ /*
+ * Reservation counters are defined as reservation plus current usage
+ * to avoid having to add every time.
+ */
+ dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
+ dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
+ dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+
+ /* Mark the buf so that this will stay incore a little longer */
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
+
+ /*
+ * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
+ * So we need to release with xfs_trans_brelse().
+ * The strategy here is identical to that of inodes; we lock
+ * the dquot in xfs_qm_dqget() before making it accessible to
+ * others. This is because dquots, like inodes, need a good level of
+ * concurrency, and we don't want to take locks on the entire buffers
+ * for dquot accesses.
+ * Note also that the dquot buffer may even be dirty at this point, if
+ * this particular dquot was repaired. We still aren't afraid to
+ * brelse it because we have the changes incore.
+ */
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+ xfs_trans_brelse(tp, bp);
+
+ return (error);
+}
+
+
+/*
+ * allocate an incore dquot from the kernel heap,
+ * and fill its core with quota information kept on disk.
+ * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
+ * if it wasn't already allocated.
+ */
+STATIC int
+xfs_qm_idtodq(
+ xfs_mount_t *mp,
+ xfs_dqid_t id, /* gid or uid, depending on type */
+ uint type, /* UDQUOT or GDQUOT */
+ uint flags, /* DQALLOC, DQREPAIR */
+ xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */
+{
+ xfs_dquot_t *dqp;
+ int error;
+ xfs_trans_t *tp;
+ int cancelflags=0;
+
+ dqp = xfs_qm_dqinit(mp, id, type);
+ tp = NULL;
+ if (flags & XFS_QMOPT_DQALLOC) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+ error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+ XFS_WRITE_LOG_RES(mp) +
+ BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
+ 128,
+ 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ if (error) {
+ cancelflags = 0;
+ goto error0;
+ }
+ cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ }
+
+ /*
+ * Read it from disk; xfs_dqread() takes care of
+ * all the necessary initialization of dquot's fields (locks, etc)
+ */
+ if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
+ /*
+ * This can happen if quotas got turned off (ESRCH),
+ * or if the dquot didn't exist on disk and we ask to
+ * allocate (ENOENT).
+ */
+ trace_xfs_dqread_fail(dqp);
+ cancelflags |= XFS_TRANS_ABORT;
+ goto error0;
+ }
+ if (tp) {
+ if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES)))
+ goto error1;
+ }
+
+ *O_dqpp = dqp;
+ return (0);
+
+ error0:
+ ASSERT(error);
+ if (tp)
+ xfs_trans_cancel(tp, cancelflags);
+ error1:
+ xfs_qm_dqdestroy(dqp);
+ *O_dqpp = NULL;
+ return (error);
+}
+
+/*
+ * Lookup a dquot in the incore dquot hashtable. We keep two separate
+ * hashtables for user and group dquots; and, these are global tables
+ * inside the XQM, not per-filesystem tables.
+ * The hash chain must be locked by caller, and it is left locked
+ * on return. Returning dquot is locked.
+ */
+STATIC int
+xfs_qm_dqlookup(
+ xfs_mount_t *mp,
+ xfs_dqid_t id,
+ xfs_dqhash_t *qh,
+ xfs_dquot_t **O_dqpp)
+{
+ xfs_dquot_t *dqp;
+ uint flist_locked;
+
+ ASSERT(mutex_is_locked(&qh->qh_lock));
+
+ flist_locked = B_FALSE;
+
+ /*
+ * Traverse the hashchain looking for a match
+ */
+ list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
+ /*
+ * We already have the hashlock. We don't need the
+ * dqlock to look at the id field of the dquot, since the
+ * id can't be modified without the hashlock anyway.
+ */
+ if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
+ trace_xfs_dqlookup_found(dqp);
+
+ /*
+ * All in core dquots must be on the dqlist of mp
+ */
+ ASSERT(!list_empty(&dqp->q_mplist));
+
+ xfs_dqlock(dqp);
+ if (dqp->q_nrefs == 0) {
+ ASSERT(!list_empty(&dqp->q_freelist));
+ if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
+ trace_xfs_dqlookup_want(dqp);
+
+ /*
+ * We may have raced with dqreclaim_one()
+ * (and lost). So, flag that we don't
+ * want the dquot to be reclaimed.
+ */
+ dqp->dq_flags |= XFS_DQ_WANT;
+ xfs_dqunlock(dqp);
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+ xfs_dqlock(dqp);
+ dqp->dq_flags &= ~(XFS_DQ_WANT);
+ }
+ flist_locked = B_TRUE;
+ }
+
+ /*
+ * id couldn't have changed; we had the hashlock all
+ * along
+ */
+ ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
+
+ if (flist_locked) {
+ if (dqp->q_nrefs != 0) {
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ flist_locked = B_FALSE;
+ } else {
+ /* take it off the freelist */
+ trace_xfs_dqlookup_freelist(dqp);
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ }
+ }
+
+ XFS_DQHOLD(dqp);
+
+ if (flist_locked)
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ /*
+ * move the dquot to the front of the hashchain
+ */
+ ASSERT(mutex_is_locked(&qh->qh_lock));
+ list_move(&dqp->q_hashlist, &qh->qh_list);
+ trace_xfs_dqlookup_done(dqp);
+ *O_dqpp = dqp;
+ return 0;
+ }
+ }
+
+ *O_dqpp = NULL;
+ ASSERT(mutex_is_locked(&qh->qh_lock));
+ return (1);
+}
+
+/*
+ * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
+ * a locked dquot, doing an allocation (if requested) as needed.
+ * When both an inode and an id are given, the inode's id takes precedence.
+ * That is, if the id changes while we don't hold the ilock inside this
+ * function, the new dquot is returned, not necessarily the one requested
+ * in the id argument.
+ */
+int
+xfs_qm_dqget(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip, /* locked inode (optional) */
+ xfs_dqid_t id, /* uid/projid/gid depending on type */
+ uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
+ uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+ xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
+{
+ xfs_dquot_t *dqp;
+ xfs_dqhash_t *h;
+ uint version;
+ int error;
+
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+ if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
+ (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
+ (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
+ return (ESRCH);
+ }
+ h = XFS_DQ_HASH(mp, id, type);
+
+#ifdef DEBUG
+ if (xfs_do_dqerror) {
+ if ((xfs_dqerror_target == mp->m_ddev_targp) &&
+ (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
+ xfs_debug(mp, "Returning error in dqget");
+ return (EIO);
+ }
+ }
+#endif
+
+ again:
+
+#ifdef DEBUG
+ ASSERT(type == XFS_DQ_USER ||
+ type == XFS_DQ_PROJ ||
+ type == XFS_DQ_GROUP);
+ if (ip) {
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (type == XFS_DQ_USER)
+ ASSERT(ip->i_udquot == NULL);
+ else
+ ASSERT(ip->i_gdquot == NULL);
+ }
+#endif
+ mutex_lock(&h->qh_lock);
+
+ /*
+ * Look in the cache (hashtable).
+ * The chain is kept locked during lookup.
+ */
+ if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
+ XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
+ /*
+ * The dquot was found, moved to the front of the chain,
+ * taken off the freelist if it was on it, and locked
+ * at this point. Just unlock the hashchain and return.
+ */
+ ASSERT(*O_dqpp);
+ ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
+ mutex_unlock(&h->qh_lock);
+ trace_xfs_dqget_hit(*O_dqpp);
+ return (0); /* success */
+ }
+ XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
+
+ /*
+ * Dquot cache miss. We don't want to keep the inode lock across
+ * a (potential) disk read. Also we don't want to deal with the lock
+ * ordering between quotainode and this inode. OTOH, dropping the inode
+ * lock here means dealing with a chown that can happen before
+ * we re-acquire the lock.
+ */
+ if (ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /*
+ * Save the hashchain version stamp, and unlock the chain, so that
+ * we don't keep the lock across a disk read
+ */
+ version = h->qh_version;
+ mutex_unlock(&h->qh_lock);
+
+ /*
+ * Allocate the dquot on the kernel heap, and read the ondisk
+ * portion off the disk. Also, do all the necessary initialization
+ * This can return ENOENT if dquot didn't exist on disk and we didn't
+ * ask it to allocate; ESRCH if quotas got turned off suddenly.
+ */
+ if ((error = xfs_qm_idtodq(mp, id, type,
+ flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
+ XFS_QMOPT_DOWARN),
+ &dqp))) {
+ if (ip)
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ return (error);
+ }
+
+ /*
+ * See if this is mount code calling to look at the overall quota limits
+ * which are stored in the id == 0 user or group's dquot.
+ * Since we may not have done a quotacheck by this point, just return
+ * the dquot without attaching it to any hashtables, lists, etc, or even
+ * taking a reference.
+ * The caller must dqdestroy this once done.
+ */
+ if (flags & XFS_QMOPT_DQSUSER) {
+ ASSERT(id == 0);
+ ASSERT(! ip);
+ goto dqret;
+ }
+
+ /*
+ * Dquot lock comes after hashlock in the lock ordering
+ */
+ if (ip) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ /*
+ * A dquot could be attached to this inode by now, since
+ * we had dropped the ilock.
+ */
+ if (type == XFS_DQ_USER) {
+ if (!XFS_IS_UQUOTA_ON(mp)) {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return XFS_ERROR(ESRCH);
+ }
+ if (ip->i_udquot) {
+ xfs_qm_dqdestroy(dqp);
+ dqp = ip->i_udquot;
+ xfs_dqlock(dqp);
+ goto dqret;
+ }
+ } else {
+ if (!XFS_IS_OQUOTA_ON(mp)) {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return XFS_ERROR(ESRCH);
+ }
+ if (ip->i_gdquot) {
+ xfs_qm_dqdestroy(dqp);
+ dqp = ip->i_gdquot;
+ xfs_dqlock(dqp);
+ goto dqret;
+ }
+ }
+ }
+
+ /*
+ * Hashlock comes after ilock in lock order
+ */
+ mutex_lock(&h->qh_lock);
+ if (version != h->qh_version) {
+ xfs_dquot_t *tmpdqp;
+ /*
+ * Now, see if somebody else put the dquot in the
+ * hashtable before us. This can happen because we didn't
+ * keep the hashchain lock. We don't have to worry about
+ * lock order between the two dquots here since dqp isn't
+ * on any findable lists yet.
+ */
+ if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
+ /*
+ * Duplicate found. Just throw away the new dquot
+ * and start over.
+ */
+ xfs_qm_dqput(tmpdqp);
+ mutex_unlock(&h->qh_lock);
+ xfs_qm_dqdestroy(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
+ goto again;
+ }
+ }
+
+ /*
+ * Put the dquot at the beginning of the hash-chain and mp's list
+ * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
+ */
+ ASSERT(mutex_is_locked(&h->qh_lock));
+ dqp->q_hash = h;
+ list_add(&dqp->q_hashlist, &h->qh_list);
+ h->qh_version++;
+
+ /*
+ * Attach this dquot to this filesystem's list of all dquots,
+ * kept inside the mount structure in m_quotainfo field
+ */
+ mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
+
+ /*
+ * We return a locked dquot to the caller, with a reference taken
+ */
+ xfs_dqlock(dqp);
+ dqp->q_nrefs = 1;
+
+ list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
+ mp->m_quotainfo->qi_dquots++;
+ mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+ mutex_unlock(&h->qh_lock);
+ dqret:
+ ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return (0);
+}
+
+
+/*
+ * Release a reference to the dquot (decrement ref-count)
+ * and unlock it. If there is a group quota attached to this
+ * dquot, carefully release that too without tripping over
+ * deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+ xfs_dquot_t *dqp)
+{
+ xfs_dquot_t *gdqp;
+
+ ASSERT(dqp->q_nrefs > 0);
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ trace_xfs_dqput(dqp);
+
+ if (dqp->q_nrefs != 1) {
+ dqp->q_nrefs--;
+ xfs_dqunlock(dqp);
+ return;
+ }
+
+ /*
+ * drop the dqlock and acquire the freelist and dqlock
+ * in the right order; but try to get it out-of-order first
+ */
+ if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
+ trace_xfs_dqput_wait(dqp);
+ xfs_dqunlock(dqp);
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+ xfs_dqlock(dqp);
+ }
+
+ while (1) {
+ gdqp = NULL;
+
+ /* We can't depend on nrefs being == 1 here */
+ if (--dqp->q_nrefs == 0) {
+ trace_xfs_dqput_free(dqp);
+
+ list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
+ xfs_Gqm->qm_dqfrlist_cnt++;
+
+ /*
+ * If we just added a udquot to the freelist, then
+ * we want to release the gdquot reference that
+ * it (probably) has. Otherwise it'll keep the
+ * gdquot from getting reclaimed.
+ */
+ if ((gdqp = dqp->q_gdquot)) {
+ /*
+ * Avoid a recursive dqput call
+ */
+ xfs_dqlock(gdqp);
+ dqp->q_gdquot = NULL;
+ }
+ }
+ xfs_dqunlock(dqp);
+
+ /*
+ * If we had a group quota inside the user quota as a hint,
+ * release it now.
+ */
+ if (! gdqp)
+ break;
+ dqp = gdqp;
+ }
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+}
+
+/*
+ * Release a dquot. Flush it if dirty, then dqput() it.
+ * dquot must not be locked.
+ */
+void
+xfs_qm_dqrele(
+ xfs_dquot_t *dqp)
+{
+ if (!dqp)
+ return;
+
+ trace_xfs_dqrele(dqp);
+
+ xfs_dqlock(dqp);
+ /*
+ * We don't care to flush it if the dquot is dirty here.
+ * That will create stutters that we want to avoid.
+ * Instead we do a delayed write when we try to reclaim
+ * a dirty dquot. Also xfs_sync will take part of the burden...
+ */
+ xfs_qm_dqput(dqp);
+}
+
+/*
+ * This is the dquot flushing I/O completion routine. It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk. It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes..
+ */
+STATIC void
+xfs_qm_dqflush_done(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
+{
+ xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
+ xfs_dquot_t *dqp = qip->qli_dquot;
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ /*
+ * We only want to pull the item from the AIL if its
+ * location in the log has not changed since we started the flush.
+ * Thus, we only bother if the dquot's lsn has
+ * not changed. First we check the lsn outside the lock
+ * since it's cheaper, and then we recheck while
+ * holding the lock before removing the dquot from the AIL.
+ */
+ if ((lip->li_flags & XFS_LI_IN_AIL) &&
+ lip->li_lsn == qip->qli_flush_lsn) {
+
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ spin_lock(&ailp->xa_lock);
+ if (lip->li_lsn == qip->qli_flush_lsn)
+ xfs_trans_ail_delete(ailp, lip);
+ else
+ spin_unlock(&ailp->xa_lock);
+ }
+
+ /*
+ * Release the dq's flush lock since we're done with it.
+ */
+ xfs_dqfunlock(dqp);
+}
+
+/*
+ * Write a modified dquot to disk.
+ * The dquot must be locked and the flush lock too taken by caller.
+ * The flush lock will not be unlocked until the dquot reaches the disk,
+ * but the dquot is free to be unlocked and modified by the caller
+ * in the interim. Dquot is still locked on return. This behavior is
+ * identical to that of inodes.
+ */
+int
+xfs_qm_dqflush(
+ xfs_dquot_t *dqp,
+ uint flags)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_buf *bp;
+ struct xfs_disk_dquot *ddqp;
+ int error;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ ASSERT(!completion_done(&dqp->q_flush));
+
+ trace_xfs_dqflush(dqp);
+
+ /*
+ * If not dirty, or it's pinned and we are not supposed to block, nada.
+ */
+ if (!XFS_DQ_IS_DIRTY(dqp) ||
+ (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
+ xfs_dqfunlock(dqp);
+ return 0;
+ }
+ xfs_qm_dqunpin_wait(dqp);
+
+ /*
+ * This may have been unpinned because the filesystem is shutting
+ * down forcibly. If that's the case we must not write this dquot
+ * to disk, because the log record didn't make it to disk!
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ dqp->dq_flags &= ~XFS_DQ_DIRTY;
+ xfs_dqfunlock(dqp);
+ return XFS_ERROR(EIO);
+ }
+
+ /*
+ * Get the buffer containing the on-disk dquot
+ */
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error) {
+ ASSERT(error != ENOENT);
+ xfs_dqfunlock(dqp);
+ return error;
+ }
+
+ /*
+ * Calculate the location of the dquot inside the buffer.
+ */
+ ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+
+ /*
+ * A simple sanity check in case we got a corrupted dquot..
+ */
+ error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+ XFS_QMOPT_DOWARN, "dqflush (incore copy)");
+ if (error) {
+ xfs_buf_relse(bp);
+ xfs_dqfunlock(dqp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return XFS_ERROR(EIO);
+ }
+
+ /* This is the only portion of data that needs to persist */
+ memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+
+ /*
+ * Clear the dirty field and remember the flush lsn for later use.
+ */
+ dqp->dq_flags &= ~XFS_DQ_DIRTY;
+
+ xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+ &dqp->q_logitem.qli_item.li_lsn);
+
+ /*
+ * Attach an iodone routine so that we can remove this dquot from the
+ * AIL and release the flush lock once the dquot is synced to disk.
+ */
+ xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
+ &dqp->q_logitem.qli_item);
+
+ /*
+ * If the buffer is pinned then push on the log so we won't
+ * get stuck waiting in the write for too long.
+ */
+ if (XFS_BUF_ISPINNED(bp)) {
+ trace_xfs_dqflush_force(dqp);
+ xfs_log_force(mp, 0);
+ }
+
+ if (flags & SYNC_WAIT)
+ error = xfs_bwrite(mp, bp);
+ else
+ xfs_bdwrite(mp, bp);
+
+ trace_xfs_dqflush_done(dqp);
+
+ /*
+ * dqp is still locked, but caller is free to unlock it now.
+ */
+ return error;
+
+}
+
+int
+xfs_qm_dqlock_nowait(
+ xfs_dquot_t *dqp)
+{
+ return mutex_trylock(&dqp->q_qlock);
+}
+
+void
+xfs_dqlock(
+ xfs_dquot_t *dqp)
+{
+ mutex_lock(&dqp->q_qlock);
+}
+
+void
+xfs_dqunlock(
+ xfs_dquot_t *dqp)
+{
+ mutex_unlock(&(dqp->q_qlock));
+ if (dqp->q_logitem.qli_dquot == dqp) {
+ /* Once was dqp->q_mount, but might just have been cleared */
+ xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
+ (xfs_log_item_t*)&(dqp->q_logitem));
+ }
+}
+
+
+void
+xfs_dqunlock_nonotify(
+ xfs_dquot_t *dqp)
+{
+ mutex_unlock(&(dqp->q_qlock));
+}
+
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lowerd id first.
+ */
+void
+xfs_dqlock2(
+ xfs_dquot_t *d1,
+ xfs_dquot_t *d2)
+{
+ if (d1 && d2) {
+ ASSERT(d1 != d2);
+ if (be32_to_cpu(d1->q_core.d_id) >
+ be32_to_cpu(d2->q_core.d_id)) {
+ mutex_lock(&d2->q_qlock);
+ mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
+ } else {
+ mutex_lock(&d1->q_qlock);
+ mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
+ }
+ } else if (d1) {
+ mutex_lock(&d1->q_qlock);
+ } else if (d2) {
+ mutex_lock(&d2->q_qlock);
+ }
+}
+
+
+/*
+ * Take a dquot out of the mount's dqlist as well as the hashlist.
+ * This is called via unmount as well as quotaoff, and the purge
+ * will always succeed unless there are soft (temp) references
+ * outstanding.
+ *
+ * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
+ * that we're returning! XXXsup - not cool.
+ */
+/* ARGSUSED */
+int
+xfs_qm_dqpurge(
+ xfs_dquot_t *dqp)
+{
+ xfs_dqhash_t *qh = dqp->q_hash;
+ xfs_mount_t *mp = dqp->q_mount;
+
+ ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
+ ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
+
+ xfs_dqlock(dqp);
+ /*
+ * We really can't afford to purge a dquot that is
+ * referenced, because these are hard refs.
+ * It shouldn't happen in general because we went thru _all_ inodes in
+ * dqrele_all_inodes before calling this and didn't let the mountlock go.
+ * However it is possible that we have dquots with temporary
+ * references that are not attached to an inode. e.g. see xfs_setattr().
+ */
+ if (dqp->q_nrefs != 0) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_hash->qh_lock);
+ return (1);
+ }
+
+ ASSERT(!list_empty(&dqp->q_freelist));
+
+ /*
+ * If we're turning off quotas, we have to make sure that, for
+ * example, we don't delete quota disk blocks while dquots are
+ * in the process of getting written to those disk blocks.
+ * This dquot might well be on AIL, and we can't leave it there
+ * if we're turning off quotas. Basically, we need this flush
+ * lock, and are willing to block on it.
+ */
+ if (!xfs_dqflock_nowait(dqp)) {
+ /*
+ * Block on the flush lock after nudging dquot buffer,
+ * if it is incore.
+ */
+ xfs_qm_dqflock_pushbuf_wait(dqp);
+ }
+
+ /*
+ * XXXIf we're turning this type of quotas off, we don't care
+ * about the dirty metadata sitting in this dquot. OTOH, if
+ * we're unmounting, we do care, so we flush it and wait.
+ */
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ int error;
+
+ /* dqflush unlocks dqflock */
+ /*
+ * Given that dqpurge is a very rare occurrence, it is OK
+ * that we're holding the hashlist and mplist locks
+ * across the disk write. But, ... XXXsup
+ *
+ * We don't care about getting disk errors here. We need
+ * to purge this dquot anyway, so we go ahead regardless.
+ */
+ error = xfs_qm_dqflush(dqp, SYNC_WAIT);
+ if (error)
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
+ xfs_dqflock(dqp);
+ }
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
+ ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+ list_del_init(&dqp->q_hashlist);
+ qh->qh_version++;
+ list_del_init(&dqp->q_mplist);
+ mp->m_quotainfo->qi_dqreclaims++;
+ mp->m_quotainfo->qi_dquots--;
+ /*
+ * XXX Move this to the front of the freelist, if we can get the
+ * freelist lock.
+ */
+ ASSERT(!list_empty(&dqp->q_freelist));
+
+ dqp->q_mount = NULL;
+ dqp->q_hash = NULL;
+ dqp->dq_flags = XFS_DQ_INACTIVE;
+ memset(&dqp->q_core, 0, sizeof(dqp->q_core));
+ xfs_dqfunlock(dqp);
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qh->qh_lock);
+ return (0);
+}
+
+
+#ifdef QUOTADEBUG
+void
+xfs_qm_dqprint(xfs_dquot_t *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+
+ xfs_debug(mp, "-----------KERNEL DQUOT----------------");
+ xfs_debug(mp, "---- dquotID = %d",
+ (int)be32_to_cpu(dqp->q_core.d_id));
+ xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
+ xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
+ xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
+ xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
+ xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
+ be64_to_cpu(dqp->q_core.d_blk_hardlimit),
+ (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
+ xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
+ be64_to_cpu(dqp->q_core.d_blk_softlimit),
+ (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
+ xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
+ be64_to_cpu(dqp->q_core.d_ino_hardlimit),
+ (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
+ xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
+ be64_to_cpu(dqp->q_core.d_ino_softlimit),
+ (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
+ xfs_debug(mp, "---- bcount = %Lu (0x%x)",
+ be64_to_cpu(dqp->q_core.d_bcount),
+ (int)be64_to_cpu(dqp->q_core.d_bcount));
+ xfs_debug(mp, "---- icount = %Lu (0x%x)",
+ be64_to_cpu(dqp->q_core.d_icount),
+ (int)be64_to_cpu(dqp->q_core.d_icount));
+ xfs_debug(mp, "---- btimer = %d",
+ (int)be32_to_cpu(dqp->q_core.d_btimer));
+ xfs_debug(mp, "---- itimer = %d",
+ (int)be32_to_cpu(dqp->q_core.d_itimer));
+ xfs_debug(mp, "---------------------------");
+}
+#endif
+
+/*
+ * Give the buffer a little push if it is incore and
+ * wait on the flush lock.
+ */
+void
+xfs_qm_dqflock_pushbuf_wait(
+ xfs_dquot_t *dqp)
+{
+ xfs_mount_t *mp = dqp->q_mount;
+ xfs_buf_t *bp;
+
+ /*
+ * Check to see if the dquot has been flushed delayed
+ * write. If so, grab its buffer and send it
+ * out immediately. We'll be able to acquire
+ * the flush lock when the I/O completes.
+ */
+ bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+ if (!bp)
+ goto out_lock;
+
+ if (XFS_BUF_ISDELAYWRITE(bp)) {
+ if (XFS_BUF_ISPINNED(bp))
+ xfs_log_force(mp, 0);
+ xfs_buf_delwri_promote(bp);
+ wake_up_process(bp->b_target->bt_task);
+ }
+ xfs_buf_relse(bp);
+out_lock:
+ xfs_dqflock(dqp);
+}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
new file mode 100644
index 00000000..5da3a23b
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DQUOT_H__
+#define __XFS_DQUOT_H__
+
+/*
+ * Dquots are structures that hold quota information about a user or a group,
+ * much like inodes are for files. In fact, dquots share many characteristics
+ * with inodes. However, dquots can also be a centralized resource, relative
+ * to a collection of inodes. In this respect, dquots share some characteristics
+ * of the superblock.
+ * XFS dquots exploit both those in its algorithms. They make every attempt
+ * to not be a bottleneck when quotas are on and have minimal impact, if any,
+ * when quotas are off.
+ */
+
+/*
+ * The hash chain headers (hash buckets)
+ */
+typedef struct xfs_dqhash {
+ struct list_head qh_list;
+ struct mutex qh_lock;
+ uint qh_version; /* ever increasing version */
+ uint qh_nelems; /* number of dquots on the list */
+} xfs_dqhash_t;
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * The incore dquot structure
+ */
+typedef struct xfs_dquot {
+ uint dq_flags; /* various flags (XFS_DQ_*) */
+ struct list_head q_freelist; /* global free list of dquots */
+ struct list_head q_mplist; /* mount's list of dquots */
+ struct list_head q_hashlist; /* gloabl hash list of dquots */
+ xfs_dqhash_t *q_hash; /* the hashchain header */
+ struct xfs_mount*q_mount; /* filesystem this relates to */
+ struct xfs_trans*q_transp; /* trans this belongs to currently */
+ uint q_nrefs; /* # active refs from inodes */
+ xfs_daddr_t q_blkno; /* blkno of dquot buffer */
+ int q_bufoffset; /* off of dq in buffer (# dquots) */
+ xfs_fileoff_t q_fileoffset; /* offset in quotas file */
+
+ struct xfs_dquot*q_gdquot; /* group dquot, hint only */
+ xfs_disk_dquot_t q_core; /* actual usage & quotas */
+ xfs_dq_logitem_t q_logitem; /* dquot log item */
+ xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
+ xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
+ xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
+ struct mutex q_qlock; /* quota lock */
+ struct completion q_flush; /* flush completion queue */
+ atomic_t q_pincount; /* dquot pin count */
+ wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
+} xfs_dquot_t;
+
+/*
+ * Lock hierarchy for q_qlock:
+ * XFS_QLOCK_NORMAL is the implicit default,
+ * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+ XFS_QLOCK_NORMAL = 0,
+ XFS_QLOCK_NESTED,
+};
+
+#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
+
+/*
+ * Manage the q_flush completion queue embedded in the dquot. This completion
+ * queue synchronizes processes attempting to flush the in-core dquot back to
+ * disk.
+ */
+static inline void xfs_dqflock(xfs_dquot_t *dqp)
+{
+ wait_for_completion(&dqp->q_flush);
+}
+
+static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+{
+ return try_wait_for_completion(&dqp->q_flush);
+}
+
+static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+{
+ complete(&dqp->q_flush);
+}
+
+#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
+#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
+#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
+#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
+#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
+#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
+#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
+ XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
+ XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
+
+#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
+ (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
+ (XFS_IS_OQUOTA_ON((d)->q_mount))))
+
+#ifdef QUOTADEBUG
+extern void xfs_qm_dqprint(xfs_dquot_t *);
+#else
+#define xfs_qm_dqprint(a)
+#endif
+
+extern void xfs_qm_dqdestroy(xfs_dquot_t *);
+extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
+extern int xfs_qm_dqpurge(xfs_dquot_t *);
+extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
+extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
+extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
+extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
+ xfs_disk_dquot_t *);
+extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
+ xfs_disk_dquot_t *);
+extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
+ xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern void xfs_qm_dqput(xfs_dquot_t *);
+extern void xfs_dqlock(xfs_dquot_t *);
+extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
+extern void xfs_dqunlock(xfs_dquot_t *);
+extern void xfs_dqunlock_nonotify(xfs_dquot_t *);
+
+#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
new file mode 100644
index 00000000..8126fc2e
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
+
+/*
+ * returns the number of iovecs needed to log the given dquot item.
+ */
+STATIC uint
+xfs_qm_dquot_logitem_size(
+ struct xfs_log_item *lip)
+{
+ /*
+ * we need only two iovecs, one for the format, one for the real thing
+ */
+ return 2;
+}
+
+/*
+ * fills in the vector of log iovecs for the given dquot log item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *logvec)
+{
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
+
+ logvec->i_addr = &qlip->qli_format;
+ logvec->i_len = sizeof(xfs_dq_logformat_t);
+ logvec->i_type = XLOG_REG_TYPE_QFORMAT;
+ logvec++;
+ logvec->i_addr = &qlip->qli_dquot->q_core;
+ logvec->i_len = sizeof(xfs_disk_dquot_t);
+ logvec->i_type = XLOG_REG_TYPE_DQUOT;
+
+ ASSERT(2 == lip->li_desc->lid_size);
+ qlip->qli_format.qlf_size = 2;
+
+}
+
+/*
+ * Increment the pin count of the given dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pin(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ atomic_inc(&dqp->q_pincount);
+}
+
+/*
+ * Decrement the pin count of the given dquot, and wake up
+ * anyone in xfs_dqwait_unpin() if the count goes to 0. The
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
+ */
+STATIC void
+xfs_qm_dquot_logitem_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(atomic_read(&dqp->q_pincount) > 0);
+ if (atomic_dec_and_test(&dqp->q_pincount))
+ wake_up(&dqp->q_pinwait);
+}
+
+/*
+ * Given the logitem, this writes the corresponding dquot entry to disk
+ * asynchronously. This is called with the dquot entry securely locked;
+ * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
+ * at the end.
+ */
+STATIC void
+xfs_qm_dquot_logitem_push(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ int error;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ ASSERT(!completion_done(&dqp->q_flush));
+
+ /*
+ * Since we were able to lock the dquot's flush lock and
+ * we found it on the AIL, the dquot must be dirty. This
+ * is because the dquot is removed from the AIL while still
+ * holding the flush lock in xfs_dqflush_done(). Thus, if
+ * we found it in the AIL and were able to obtain the flush
+ * lock without sleeping, then there must not have been
+ * anyone in the process of flushing the dquot.
+ */
+ error = xfs_qm_dqflush(dqp, 0);
+ if (error)
+ xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+ __func__, error, dqp);
+ xfs_dqunlock(dqp);
+}
+
+STATIC xfs_lsn_t
+xfs_qm_dquot_logitem_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ /*
+ * We always re-log the entire dquot when it becomes dirty,
+ * so, the latest copy _is_ the only one that matters.
+ */
+ return lsn;
+}
+
+/*
+ * This is called to wait for the given dquot to be unpinned.
+ * Most of these pin/unpin routines are plagiarized from inode code.
+ */
+void
+xfs_qm_dqunpin_wait(
+ struct xfs_dquot *dqp)
+{
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ if (atomic_read(&dqp->q_pincount) == 0)
+ return;
+
+ /*
+ * Give the log a push so we don't wait here too long.
+ */
+ xfs_log_force(dqp->q_mount, 0);
+ wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
+}
+
+/*
+ * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
+ * the dquot is locked by us, but the flush lock isn't. So, here we are
+ * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
+ * If so, we want to push it out to help us take this item off the AIL as soon
+ * as possible.
+ *
+ * We must not be holding the AIL lock at this point. Calling incore() to
+ * search the buffer cache can be a time consuming thing, and AIL lock is a
+ * spinlock.
+ */
+STATIC bool
+xfs_qm_dquot_logitem_pushbuf(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
+ struct xfs_dquot *dqp = qlip->qli_dquot;
+ struct xfs_buf *bp;
+ bool ret = true;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ /*
+ * If flushlock isn't locked anymore, chances are that the
+ * inode flush completed and the inode was taken off the AIL.
+ * So, just get out.
+ */
+ if (completion_done(&dqp->q_flush) ||
+ !(lip->li_flags & XFS_LI_IN_AIL)) {
+ xfs_dqunlock(dqp);
+ return true;
+ }
+
+ bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
+ dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+ xfs_dqunlock(dqp);
+ if (!bp)
+ return true;
+ if (XFS_BUF_ISDELAYWRITE(bp))
+ xfs_buf_delwri_promote(bp);
+ if (XFS_BUF_ISPINNED(bp))
+ ret = false;
+ xfs_buf_relse(bp);
+ return ret;
+}
+
+/*
+ * This is called to attempt to lock the dquot associated with this
+ * dquot log item. Don't sleep on the dquot lock or the flush lock.
+ * If the flush lock is already held, indicating that the dquot has
+ * been or is in the process of being flushed, then see if we can
+ * find the dquot's buffer in the buffer cache without sleeping. If
+ * we can and it is marked delayed write, then we want to send it out.
+ * We delay doing so until the push routine, though, to avoid sleeping
+ * in any device strategy routines.
+ */
+STATIC uint
+xfs_qm_dquot_logitem_trylock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ if (atomic_read(&dqp->q_pincount) > 0)
+ return XFS_ITEM_PINNED;
+
+ if (!xfs_qm_dqlock_nowait(dqp))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_dqflock_nowait(dqp)) {
+ /*
+ * dquot has already been flushed to the backing buffer,
+ * leave it locked, pushbuf routine will unlock it.
+ */
+ return XFS_ITEM_PUSHBUF;
+ }
+
+ ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Unlock the dquot associated with the log item.
+ * Clear the fields of the dquot and dquot log item that
+ * are specific to the current transaction. If the
+ * hold flags is set, do not unlock the dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ /*
+ * Clear the transaction pointer in the dquot
+ */
+ dqp->q_transp = NULL;
+
+ /*
+ * dquots are never 'held' from getting unlocked at the end of
+ * a transaction. Their locking and unlocking is hidden inside the
+ * transaction layer, within trans_commit. Hence, no LI_HOLD flag
+ * for the logitem.
+ */
+ xfs_dqunlock(dqp);
+}
+
+/*
+ * this needs to stamp an lsn into the dquot, I think.
+ * rpc's that look at user dquot's would then have to
+ * push on the dependency recorded in the dquot
+ */
+STATIC void
+xfs_qm_dquot_logitem_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector for dquots
+ */
+static struct xfs_item_ops xfs_dquot_item_ops = {
+ .iop_size = xfs_qm_dquot_logitem_size,
+ .iop_format = xfs_qm_dquot_logitem_format,
+ .iop_pin = xfs_qm_dquot_logitem_pin,
+ .iop_unpin = xfs_qm_dquot_logitem_unpin,
+ .iop_trylock = xfs_qm_dquot_logitem_trylock,
+ .iop_unlock = xfs_qm_dquot_logitem_unlock,
+ .iop_committed = xfs_qm_dquot_logitem_committed,
+ .iop_push = xfs_qm_dquot_logitem_push,
+ .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
+ .iop_committing = xfs_qm_dquot_logitem_committing
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ */
+void
+xfs_qm_dquot_logitem_init(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_dq_logitem *lp = &dqp->q_logitem;
+
+ xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+ &xfs_dquot_item_ops);
+ lp->qli_dquot = dqp;
+ lp->qli_format.qlf_type = XFS_LI_DQUOT;
+ lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
+ lp->qli_format.qlf_blkno = dqp->q_blkno;
+ lp->qli_format.qlf_len = 1;
+ /*
+ * This is just the offset of this dquot within its buffer
+ * (which is currently 1 FSB and probably won't change).
+ * Hence 32 bits for this offset should be just fine.
+ * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
+ * here, and recompute it at recovery time.
+ */
+ lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
+}
+
+/*------------------ QUOTAOFF LOG ITEMS -------------------*/
+
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_qoff_logitem, qql_item);
+}
+
+
+/*
+ * This returns the number of iovecs needed to log the given quotaoff item.
+ * We only need 1 iovec for an quotaoff item. It just logs the
+ * quotaoff_log_format structure.
+ */
+STATIC uint
+xfs_qm_qoff_logitem_size(
+ struct xfs_log_item *lip)
+{
+ return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given quotaoff log item. We use only 1 iovec, and we point that
+ * at the quotaoff_log_format structure embedded in the quotaoff item.
+ * It is at this point that we assert that all of the extent
+ * slots in the quotaoff item have been filled.
+ */
+STATIC void
+xfs_qm_qoff_logitem_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
+{
+ struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
+
+ ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
+
+ log_vector->i_addr = &qflip->qql_format;
+ log_vector->i_len = sizeof(xfs_qoff_logitem_t);
+ log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
+ qflip->qql_format.qf_size = 1;
+}
+
+/*
+ * Pinning has no meaning for an quotaoff item, so just return.
+ */
+STATIC void
+xfs_qm_qoff_logitem_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an quotaoff item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+/*
+ * Quotaoff items have no locking, so just return success.
+ */
+STATIC uint
+xfs_qm_qoff_logitem_trylock(
+ struct xfs_log_item *lip)
+{
+ return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Quotaoff items have no locking or pushing, so return failure
+ * so that the caller doesn't bother with us.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unlock(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * The quotaoff-start-item is logged only once and cannot be moved in the log,
+ * so simply return the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_qm_qoff_logitem_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ return lsn;
+}
+
+/*
+ * There isn't much you can do to push on an quotaoff item. It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+STATIC void
+xfs_qm_qoff_logitem_push(
+ struct xfs_log_item *lip)
+{
+}
+
+
+STATIC xfs_lsn_t
+xfs_qm_qoffend_logitem_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
+ struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
+ struct xfs_ail *ailp = qfs->qql_item.li_ailp;
+
+ /*
+ * Delete the qoff-start logitem from the AIL.
+ * xfs_trans_ail_delete() drops the AIL lock.
+ */
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
+
+ kmem_free(qfs);
+ kmem_free(qfe);
+ return (xfs_lsn_t)-1;
+}
+
+/*
+ * XXX rcc - don't know quite what to do with this. I think we can
+ * just ignore it. The only time that isn't the case is if we allow
+ * the client to somehow see that quotas have been turned off in which
+ * we can't allow that to get back until the quotaoff hits the disk.
+ * So how would that happen? Also, do we need different routines for
+ * quotaoff start and quotaoff end? I suspect the answer is yes but
+ * to be sure, I need to look at the recovery code and see how quota off
+ * recovery is handled (do we roll forward or back or do something else).
+ * If we roll forwards or backwards, then we need two separate routines,
+ * one that does nothing and one that stamps in the lsn that matters
+ * (truly makes the quotaoff irrevocable). If we do something else,
+ * then maybe we don't need two.
+ */
+STATIC void
+xfs_qm_qoff_logitem_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
+{
+}
+
+static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+ .iop_size = xfs_qm_qoff_logitem_size,
+ .iop_format = xfs_qm_qoff_logitem_format,
+ .iop_pin = xfs_qm_qoff_logitem_pin,
+ .iop_unpin = xfs_qm_qoff_logitem_unpin,
+ .iop_trylock = xfs_qm_qoff_logitem_trylock,
+ .iop_unlock = xfs_qm_qoff_logitem_unlock,
+ .iop_committed = xfs_qm_qoffend_logitem_committed,
+ .iop_push = xfs_qm_qoff_logitem_push,
+ .iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * This is the ops vector shared by all quotaoff-start log items.
+ */
+static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+ .iop_size = xfs_qm_qoff_logitem_size,
+ .iop_format = xfs_qm_qoff_logitem_format,
+ .iop_pin = xfs_qm_qoff_logitem_pin,
+ .iop_unpin = xfs_qm_qoff_logitem_unpin,
+ .iop_trylock = xfs_qm_qoff_logitem_trylock,
+ .iop_unlock = xfs_qm_qoff_logitem_unlock,
+ .iop_committed = xfs_qm_qoff_logitem_committed,
+ .iop_push = xfs_qm_qoff_logitem_push,
+ .iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * Allocate and initialize an quotaoff item of the correct quota type(s).
+ */
+struct xfs_qoff_logitem *
+xfs_qm_qoff_logitem_init(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem *start,
+ uint flags)
+{
+ struct xfs_qoff_logitem *qf;
+
+ qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+
+ xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+ &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
+ qf->qql_item.li_mountp = mp;
+ qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
+ qf->qql_format.qf_flags = flags;
+ qf->qql_start_lip = start;
+ return qf;
+}
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
new file mode 100644
index 00000000..5acae2ad
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DQUOT_ITEM_H__
+#define __XFS_DQUOT_ITEM_H__
+
+struct xfs_dquot;
+struct xfs_trans;
+struct xfs_mount;
+struct xfs_qoff_logitem;
+
+typedef struct xfs_dq_logitem {
+ xfs_log_item_t qli_item; /* common portion */
+ struct xfs_dquot *qli_dquot; /* dquot ptr */
+ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
+ xfs_dq_logformat_t qli_format; /* logged structure */
+} xfs_dq_logitem_t;
+
+typedef struct xfs_qoff_logitem {
+ xfs_log_item_t qql_item; /* common portion */
+ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+ xfs_qoff_logformat_t qql_format; /* logged structure */
+} xfs_qoff_logitem_t;
+
+
+extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *);
+extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
+ struct xfs_qoff_logitem *, uint);
+extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
+ struct xfs_qoff_logitem *, uint);
+extern void xfs_trans_log_quotaoff_item(struct xfs_trans *,
+ struct xfs_qoff_logitem *);
+
+#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
new file mode 100644
index 00000000..e70c7fc9
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm.c
@@ -0,0 +1,2462 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_utils.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+
+/*
+ * The global quota manager. There is only one of these for the entire
+ * system, _not_ one per file system. XQM keeps track of the overall
+ * quota functionality, including maintaining the freelist and hash
+ * tables of dquots.
+ */
+struct mutex xfs_Gqm_lock;
+struct xfs_qm *xfs_Gqm;
+uint ndquot;
+
+kmem_zone_t *qm_dqzone;
+kmem_zone_t *qm_dqtrxzone;
+
+STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
+STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
+
+STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
+STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
+STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
+
+static struct shrinker xfs_qm_shaker = {
+ .shrink = xfs_qm_shake,
+ .seeks = DEFAULT_SEEKS,
+};
+
+#ifdef DEBUG
+extern struct mutex qcheck_lock;
+#endif
+
+#ifdef QUOTADEBUG
+static void
+xfs_qm_dquot_list_print(
+ struct xfs_mount *mp)
+{
+ xfs_dquot_t *dqp;
+ int i = 0;
+
+ list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
+ xfs_debug(mp, " %d. \"%d (%s)\" "
+ "bcnt = %lld, icnt = %lld, refs = %d",
+ i++, be32_to_cpu(dqp->q_core.d_id),
+ DQFLAGTO_TYPESTR(dqp),
+ (long long)be64_to_cpu(dqp->q_core.d_bcount),
+ (long long)be64_to_cpu(dqp->q_core.d_icount),
+ dqp->q_nrefs);
+ }
+}
+#else
+static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
+#endif
+
+/*
+ * Initialize the XQM structure.
+ * Note that there is not one quota manager per file system.
+ */
+STATIC struct xfs_qm *
+xfs_Gqm_init(void)
+{
+ xfs_dqhash_t *udqhash, *gdqhash;
+ xfs_qm_t *xqm;
+ size_t hsize;
+ uint i;
+
+ /*
+ * Initialize the dquot hash tables.
+ */
+ udqhash = kmem_zalloc_greedy(&hsize,
+ XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
+ XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
+ if (!udqhash)
+ goto out;
+
+ gdqhash = kmem_zalloc_large(hsize);
+ if (!gdqhash)
+ goto out_free_udqhash;
+
+ hsize /= sizeof(xfs_dqhash_t);
+ ndquot = hsize << 8;
+
+ xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
+ xqm->qm_dqhashmask = hsize - 1;
+ xqm->qm_usr_dqhtable = udqhash;
+ xqm->qm_grp_dqhtable = gdqhash;
+ ASSERT(xqm->qm_usr_dqhtable != NULL);
+ ASSERT(xqm->qm_grp_dqhtable != NULL);
+
+ for (i = 0; i < hsize; i++) {
+ xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
+ xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
+ }
+
+ /*
+ * Freelist of all dquots of all file systems
+ */
+ INIT_LIST_HEAD(&xqm->qm_dqfrlist);
+ xqm->qm_dqfrlist_cnt = 0;
+ mutex_init(&xqm->qm_dqfrlist_lock);
+
+ /*
+ * dquot zone. we register our own low-memory callback.
+ */
+ if (!qm_dqzone) {
+ xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
+ "xfs_dquots");
+ qm_dqzone = xqm->qm_dqzone;
+ } else
+ xqm->qm_dqzone = qm_dqzone;
+
+ register_shrinker(&xfs_qm_shaker);
+
+ /*
+ * The t_dqinfo portion of transactions.
+ */
+ if (!qm_dqtrxzone) {
+ xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
+ "xfs_dqtrx");
+ qm_dqtrxzone = xqm->qm_dqtrxzone;
+ } else
+ xqm->qm_dqtrxzone = qm_dqtrxzone;
+
+ atomic_set(&xqm->qm_totaldquots, 0);
+ xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
+ xqm->qm_nrefs = 0;
+#ifdef DEBUG
+ mutex_init(&qcheck_lock);
+#endif
+ return xqm;
+
+ out_free_udqhash:
+ kmem_free_large(udqhash);
+ out:
+ return NULL;
+}
+
+/*
+ * Destroy the global quota manager when its reference count goes to zero.
+ */
+STATIC void
+xfs_qm_destroy(
+ struct xfs_qm *xqm)
+{
+ struct xfs_dquot *dqp, *n;
+ int hsize, i;
+
+ ASSERT(xqm != NULL);
+ ASSERT(xqm->qm_nrefs == 0);
+ unregister_shrinker(&xfs_qm_shaker);
+ hsize = xqm->qm_dqhashmask + 1;
+ for (i = 0; i < hsize; i++) {
+ xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
+ xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
+ }
+ kmem_free_large(xqm->qm_usr_dqhtable);
+ kmem_free_large(xqm->qm_grp_dqhtable);
+ xqm->qm_usr_dqhtable = NULL;
+ xqm->qm_grp_dqhtable = NULL;
+ xqm->qm_dqhashmask = 0;
+
+ /* frlist cleanup */
+ mutex_lock(&xqm->qm_dqfrlist_lock);
+ list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
+ xfs_dqlock(dqp);
+#ifdef QUOTADEBUG
+ xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
+#endif
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ xfs_dqunlock(dqp);
+ xfs_qm_dqdestroy(dqp);
+ }
+ mutex_unlock(&xqm->qm_dqfrlist_lock);
+ mutex_destroy(&xqm->qm_dqfrlist_lock);
+#ifdef DEBUG
+ mutex_destroy(&qcheck_lock);
+#endif
+ kmem_free(xqm);
+}
+
+/*
+ * Called at mount time to let XQM know that another file system is
+ * starting quotas. This isn't crucial information as the individual mount
+ * structures are pretty independent, but it helps the XQM keep a
+ * global view of what's going on.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_hold_quotafs_ref(
+ struct xfs_mount *mp)
+{
+ /*
+ * Need to lock the xfs_Gqm structure for things like this. For example,
+ * the structure could disappear between the entry to this routine and
+ * a HOLD operation if not locked.
+ */
+ mutex_lock(&xfs_Gqm_lock);
+
+ if (!xfs_Gqm) {
+ xfs_Gqm = xfs_Gqm_init();
+ if (!xfs_Gqm) {
+ mutex_unlock(&xfs_Gqm_lock);
+ return ENOMEM;
+ }
+ }
+
+ /*
+ * We can keep a list of all filesystems with quotas mounted for
+ * debugging and statistical purposes, but ...
+ * Just take a reference and get out.
+ */
+ xfs_Gqm->qm_nrefs++;
+ mutex_unlock(&xfs_Gqm_lock);
+
+ return 0;
+}
+
+
+/*
+ * Release the reference that a filesystem took at mount time,
+ * so that we know when we need to destroy the entire quota manager.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_rele_quotafs_ref(
+ struct xfs_mount *mp)
+{
+ xfs_dquot_t *dqp, *n;
+
+ ASSERT(xfs_Gqm);
+ ASSERT(xfs_Gqm->qm_nrefs > 0);
+
+ /*
+ * Go thru the freelist and destroy all inactive dquots.
+ */
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+
+ list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+ ASSERT(dqp->q_mount == NULL);
+ ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+ ASSERT(list_empty(&dqp->q_hashlist));
+ ASSERT(list_empty(&dqp->q_mplist));
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ xfs_dqunlock(dqp);
+ xfs_qm_dqdestroy(dqp);
+ } else {
+ xfs_dqunlock(dqp);
+ }
+ }
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+
+ /*
+ * Destroy the entire XQM. If somebody mounts with quotaon, this'll
+ * be restarted.
+ */
+ mutex_lock(&xfs_Gqm_lock);
+ if (--xfs_Gqm->qm_nrefs == 0) {
+ xfs_qm_destroy(xfs_Gqm);
+ xfs_Gqm = NULL;
+ }
+ mutex_unlock(&xfs_Gqm_lock);
+}
+
+/*
+ * Just destroy the quotainfo structure.
+ */
+void
+xfs_qm_unmount(
+ struct xfs_mount *mp)
+{
+ if (mp->m_quotainfo) {
+ xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+ xfs_qm_destroy_quotainfo(mp);
+ }
+}
+
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo. This is also responsible for
+ * running a quotacheck as necessary. We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+ xfs_mount_t *mp)
+{
+ int error = 0;
+ uint sbf;
+
+ /*
+ * If quotas on realtime volumes is not supported, we disable
+ * quotas immediately.
+ */
+ if (mp->m_sb.sb_rextents) {
+ xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+ mp->m_qflags = 0;
+ goto write_changes;
+ }
+
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ /*
+ * Allocate the quotainfo structure inside the mount struct, and
+ * create quotainode(s), and change/rev superblock if necessary.
+ */
+ error = xfs_qm_init_quotainfo(mp);
+ if (error) {
+ /*
+ * We must turn off quotas.
+ */
+ ASSERT(mp->m_quotainfo == NULL);
+ mp->m_qflags = 0;
+ goto write_changes;
+ }
+ /*
+ * If any of the quotas are not consistent, do a quotacheck.
+ */
+ if (XFS_QM_NEED_QUOTACHECK(mp)) {
+ error = xfs_qm_quotacheck(mp);
+ if (error) {
+ /* Quotacheck failed and disabled quotas. */
+ return;
+ }
+ }
+ /*
+ * If one type of quotas is off, then it will lose its
+ * quotachecked status, since we won't be doing accounting for
+ * that type anymore.
+ */
+ if (!XFS_IS_UQUOTA_ON(mp))
+ mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+ if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
+ mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+
+ write_changes:
+ /*
+ * We actually don't have to acquire the m_sb_lock at all.
+ * This can only be called from mount, and that's single threaded. XXX
+ */
+ spin_lock(&mp->m_sb_lock);
+ sbf = mp->m_sb.sb_qflags;
+ mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+ if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+ /*
+ * We could only have been turning quotas off.
+ * We aren't in very good shape actually because
+ * the incore structures are convinced that quotas are
+ * off, but the on disk superblock doesn't know that !
+ */
+ ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+ xfs_alert(mp, "%s: Superblock update failed!",
+ __func__);
+ }
+ }
+
+ if (error) {
+ xfs_warn(mp, "Failed to initialize disk quotas.");
+ return;
+ }
+
+#ifdef QUOTADEBUG
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_qm_internalqcheck(mp);
+#endif
+}
+
+/*
+ * Called from the vfsops layer.
+ */
+void
+xfs_qm_unmount_quotas(
+ xfs_mount_t *mp)
+{
+ /*
+ * Release the dquots that root inode, et al might be holding,
+ * before we flush quotas and blow away the quotainfo structure.
+ */
+ ASSERT(mp->m_rootip);
+ xfs_qm_dqdetach(mp->m_rootip);
+ if (mp->m_rbmip)
+ xfs_qm_dqdetach(mp->m_rbmip);
+ if (mp->m_rsumip)
+ xfs_qm_dqdetach(mp->m_rsumip);
+
+ /*
+ * Release the quota inodes.
+ */
+ if (mp->m_quotainfo) {
+ if (mp->m_quotainfo->qi_uquotaip) {
+ IRELE(mp->m_quotainfo->qi_uquotaip);
+ mp->m_quotainfo->qi_uquotaip = NULL;
+ }
+ if (mp->m_quotainfo->qi_gquotaip) {
+ IRELE(mp->m_quotainfo->qi_gquotaip);
+ mp->m_quotainfo->qi_gquotaip = NULL;
+ }
+ }
+}
+
+/*
+ * Flush all dquots of the given file system to disk. The dquots are
+ * _not_ purged from memory here, just their data written to disk.
+ */
+STATIC int
+xfs_qm_dqflush_all(
+ struct xfs_mount *mp,
+ int sync_mode)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ int recl;
+ struct xfs_dquot *dqp;
+ int error;
+
+ if (!q)
+ return 0;
+again:
+ mutex_lock(&q->qi_dqlist_lock);
+ list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
+ xfs_dqlock(dqp);
+ if (! XFS_DQ_IS_DIRTY(dqp)) {
+ xfs_dqunlock(dqp);
+ continue;
+ }
+
+ /* XXX a sentinel would be better */
+ recl = q->qi_dqreclaims;
+ if (!xfs_dqflock_nowait(dqp)) {
+ /*
+ * If we can't grab the flush lock then check
+ * to see if the dquot has been flushed delayed
+ * write. If so, grab its buffer and send it
+ * out immediately. We'll be able to acquire
+ * the flush lock when the I/O completes.
+ */
+ xfs_qm_dqflock_pushbuf_wait(dqp);
+ }
+ /*
+ * Let go of the mplist lock. We don't want to hold it
+ * across a disk write.
+ */
+ mutex_unlock(&q->qi_dqlist_lock);
+ error = xfs_qm_dqflush(dqp, sync_mode);
+ xfs_dqunlock(dqp);
+ if (error)
+ return error;
+
+ mutex_lock(&q->qi_dqlist_lock);
+ if (recl != q->qi_dqreclaims) {
+ mutex_unlock(&q->qi_dqlist_lock);
+ /* XXX restart limit */
+ goto again;
+ }
+ }
+
+ mutex_unlock(&q->qi_dqlist_lock);
+ /* return ! busy */
+ return 0;
+}
+/*
+ * Release the group dquot pointers the user dquots may be
+ * carrying around as a hint. mplist is locked on entry and exit.
+ */
+STATIC void
+xfs_qm_detach_gdquots(
+ struct xfs_mount *mp)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_dquot *dqp, *gdqp;
+ int nrecl;
+
+ again:
+ ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+ list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
+ xfs_dqlock(dqp);
+ if ((gdqp = dqp->q_gdquot)) {
+ xfs_dqlock(gdqp);
+ dqp->q_gdquot = NULL;
+ }
+ xfs_dqunlock(dqp);
+
+ if (gdqp) {
+ /*
+ * Can't hold the mplist lock across a dqput.
+ * XXXmust convert to marker based iterations here.
+ */
+ nrecl = q->qi_dqreclaims;
+ mutex_unlock(&q->qi_dqlist_lock);
+ xfs_qm_dqput(gdqp);
+
+ mutex_lock(&q->qi_dqlist_lock);
+ if (nrecl != q->qi_dqreclaims)
+ goto again;
+ }
+ }
+}
+
+/*
+ * Go through all the incore dquots of this file system and take them
+ * off the mplist and hashlist, if the dquot type matches the dqtype
+ * parameter. This is used when turning off quota accounting for
+ * users and/or groups, as well as when the filesystem is unmounting.
+ */
+STATIC int
+xfs_qm_dqpurge_int(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_dquot *dqp, *n;
+ uint dqtype;
+ int nrecl;
+ int nmisses;
+
+ if (!q)
+ return 0;
+
+ dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
+ dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
+ dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
+
+ mutex_lock(&q->qi_dqlist_lock);
+
+ /*
+ * In the first pass through all incore dquots of this filesystem,
+ * we release the group dquot pointers the user dquots may be
+ * carrying around as a hint. We need to do this irrespective of
+ * what's being turned off.
+ */
+ xfs_qm_detach_gdquots(mp);
+
+ again:
+ nmisses = 0;
+ ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+ /*
+ * Try to get rid of all of the unwanted dquots. The idea is to
+ * get them off mplist and hashlist, but leave them on freelist.
+ */
+ list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
+ /*
+ * It's OK to look at the type without taking dqlock here.
+ * We're holding the mplist lock here, and that's needed for
+ * a dqreclaim.
+ */
+ if ((dqp->dq_flags & dqtype) == 0)
+ continue;
+
+ if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
+ nrecl = q->qi_dqreclaims;
+ mutex_unlock(&q->qi_dqlist_lock);
+ mutex_lock(&dqp->q_hash->qh_lock);
+ mutex_lock(&q->qi_dqlist_lock);
+
+ /*
+ * XXXTheoretically, we can get into a very long
+ * ping pong game here.
+ * No one can be adding dquots to the mplist at
+ * this point, but somebody might be taking things off.
+ */
+ if (nrecl != q->qi_dqreclaims) {
+ mutex_unlock(&dqp->q_hash->qh_lock);
+ goto again;
+ }
+ }
+
+ /*
+ * Take the dquot off the mplist and hashlist. It may remain on
+ * freelist in INACTIVE state.
+ */
+ nmisses += xfs_qm_dqpurge(dqp);
+ }
+ mutex_unlock(&q->qi_dqlist_lock);
+ return nmisses;
+}
+
+int
+xfs_qm_dqpurge_all(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ int ndquots;
+
+ /*
+ * Purge the dquot cache.
+ * None of the dquots should really be busy at this point.
+ */
+ if (mp->m_quotainfo) {
+ while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
+ delay(ndquots * 10);
+ }
+ }
+ return 0;
+}
+
+STATIC int
+xfs_qm_dqattach_one(
+ xfs_inode_t *ip,
+ xfs_dqid_t id,
+ uint type,
+ uint doalloc,
+ xfs_dquot_t *udqhint, /* hint */
+ xfs_dquot_t **IO_idqpp)
+{
+ xfs_dquot_t *dqp;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ error = 0;
+
+ /*
+ * See if we already have it in the inode itself. IO_idqpp is
+ * &i_udquot or &i_gdquot. This made the code look weird, but
+ * made the logic a lot simpler.
+ */
+ dqp = *IO_idqpp;
+ if (dqp) {
+ trace_xfs_dqattach_found(dqp);
+ return 0;
+ }
+
+ /*
+ * udqhint is the i_udquot field in inode, and is non-NULL only
+ * when the type arg is group/project. Its purpose is to save a
+ * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
+ * the user dquot.
+ */
+ if (udqhint) {
+ ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
+ xfs_dqlock(udqhint);
+
+ /*
+ * No need to take dqlock to look at the id.
+ *
+ * The ID can't change until it gets reclaimed, and it won't
+ * be reclaimed as long as we have a ref from inode and we
+ * hold the ilock.
+ */
+ dqp = udqhint->q_gdquot;
+ if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
+ xfs_dqlock(dqp);
+ XFS_DQHOLD(dqp);
+ ASSERT(*IO_idqpp == NULL);
+ *IO_idqpp = dqp;
+
+ xfs_dqunlock(dqp);
+ xfs_dqunlock(udqhint);
+ return 0;
+ }
+
+ /*
+ * We can't hold a dquot lock when we call the dqget code.
+ * We'll deadlock in no time, because of (not conforming to)
+ * lock ordering - the inodelock comes before any dquot lock,
+ * and we may drop and reacquire the ilock in xfs_qm_dqget().
+ */
+ xfs_dqunlock(udqhint);
+ }
+
+ /*
+ * Find the dquot from somewhere. This bumps the
+ * reference count of dquot and returns it locked.
+ * This can return ENOENT if dquot didn't exist on
+ * disk and we didn't ask it to allocate;
+ * ESRCH if quotas got turned off suddenly.
+ */
+ error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+ doalloc | XFS_QMOPT_DOWARN, &dqp);
+ if (error)
+ return error;
+
+ trace_xfs_dqattach_get(dqp);
+
+ /*
+ * dqget may have dropped and re-acquired the ilock, but it guarantees
+ * that the dquot returned is the one that should go in the inode.
+ */
+ *IO_idqpp = dqp;
+ xfs_dqunlock(dqp);
+ return 0;
+}
+
+
+/*
+ * Given a udquot and gdquot, attach a ptr to the group dquot in the
+ * udquot as a hint for future lookups. The idea sounds simple, but the
+ * execution isn't, because the udquot might have a group dquot attached
+ * already and getting rid of that gets us into lock ordering constraints.
+ * The process is complicated more by the fact that the dquots may or may not
+ * be locked on entry.
+ */
+STATIC void
+xfs_qm_dqattach_grouphint(
+ xfs_dquot_t *udq,
+ xfs_dquot_t *gdq)
+{
+ xfs_dquot_t *tmp;
+
+ xfs_dqlock(udq);
+
+ if ((tmp = udq->q_gdquot)) {
+ if (tmp == gdq) {
+ xfs_dqunlock(udq);
+ return;
+ }
+
+ udq->q_gdquot = NULL;
+ /*
+ * We can't keep any dqlocks when calling dqrele,
+ * because the freelist lock comes before dqlocks.
+ */
+ xfs_dqunlock(udq);
+ /*
+ * we took a hard reference once upon a time in dqget,
+ * so give it back when the udquot no longer points at it
+ * dqput() does the unlocking of the dquot.
+ */
+ xfs_qm_dqrele(tmp);
+
+ xfs_dqlock(udq);
+ xfs_dqlock(gdq);
+
+ } else {
+ ASSERT(XFS_DQ_IS_LOCKED(udq));
+ xfs_dqlock(gdq);
+ }
+
+ ASSERT(XFS_DQ_IS_LOCKED(udq));
+ ASSERT(XFS_DQ_IS_LOCKED(gdq));
+ /*
+ * Somebody could have attached a gdquot here,
+ * when we dropped the uqlock. If so, just do nothing.
+ */
+ if (udq->q_gdquot == NULL) {
+ XFS_DQHOLD(gdq);
+ udq->q_gdquot = gdq;
+ }
+
+ xfs_dqunlock(gdq);
+ xfs_dqunlock(udq);
+}
+
+
+/*
+ * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
+ * into account.
+ * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * Inode may get unlocked and relocked in here, and the caller must deal with
+ * the consequences.
+ */
+int
+xfs_qm_dqattach_locked(
+ xfs_inode_t *ip,
+ uint flags)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ uint nquotas = 0;
+ int error = 0;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) ||
+ !XFS_IS_QUOTA_ON(mp) ||
+ !XFS_NOT_DQATTACHED(mp, ip) ||
+ ip->i_ino == mp->m_sb.sb_uquotino ||
+ ip->i_ino == mp->m_sb.sb_gquotino)
+ return 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
+ flags & XFS_QMOPT_DQALLOC,
+ NULL, &ip->i_udquot);
+ if (error)
+ goto done;
+ nquotas++;
+ }
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (XFS_IS_OQUOTA_ON(mp)) {
+ error = XFS_IS_GQUOTA_ON(mp) ?
+ xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+ flags & XFS_QMOPT_DQALLOC,
+ ip->i_udquot, &ip->i_gdquot) :
+ xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ flags & XFS_QMOPT_DQALLOC,
+ ip->i_udquot, &ip->i_gdquot);
+ /*
+ * Don't worry about the udquot that we may have
+ * attached above. It'll get detached, if not already.
+ */
+ if (error)
+ goto done;
+ nquotas++;
+ }
+
+ /*
+ * Attach this group quota to the user quota as a hint.
+ * This WON'T, in general, result in a thrash.
+ */
+ if (nquotas == 2) {
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(ip->i_udquot);
+ ASSERT(ip->i_gdquot);
+
+ /*
+ * We may or may not have the i_udquot locked at this point,
+ * but this check is OK since we don't depend on the i_gdquot to
+ * be accurate 100% all the time. It is just a hint, and this
+ * will succeed in general.
+ */
+ if (ip->i_udquot->q_gdquot == ip->i_gdquot)
+ goto done;
+ /*
+ * Attach i_gdquot to the gdquot hint inside the i_udquot.
+ */
+ xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
+ }
+
+ done:
+#ifdef QUOTADEBUG
+ if (! error) {
+ if (XFS_IS_UQUOTA_ON(mp))
+ ASSERT(ip->i_udquot);
+ if (XFS_IS_OQUOTA_ON(mp))
+ ASSERT(ip->i_gdquot);
+ }
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+#endif
+ return error;
+}
+
+int
+xfs_qm_dqattach(
+ struct xfs_inode *ip,
+ uint flags)
+{
+ int error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_qm_dqattach_locked(ip, flags);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ return error;
+}
+
+/*
+ * Release dquots (and their references) if any.
+ * The inode should be locked EXCL except when this's called by
+ * xfs_ireclaim.
+ */
+void
+xfs_qm_dqdetach(
+ xfs_inode_t *ip)
+{
+ if (!(ip->i_udquot || ip->i_gdquot))
+ return;
+
+ trace_xfs_dquot_dqdetach(ip);
+
+ ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
+ ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+ if (ip->i_udquot) {
+ xfs_qm_dqrele(ip->i_udquot);
+ ip->i_udquot = NULL;
+ }
+ if (ip->i_gdquot) {
+ xfs_qm_dqrele(ip->i_gdquot);
+ ip->i_gdquot = NULL;
+ }
+}
+
+int
+xfs_qm_sync(
+ struct xfs_mount *mp,
+ int flags)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ int recl, restarts;
+ struct xfs_dquot *dqp;
+ int error;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ restarts = 0;
+
+ again:
+ mutex_lock(&q->qi_dqlist_lock);
+ /*
+ * dqpurge_all() also takes the mplist lock and iterate thru all dquots
+ * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
+ * when we have the mplist lock, we know that dquots will be consistent
+ * as long as we have it locked.
+ */
+ if (!XFS_IS_QUOTA_ON(mp)) {
+ mutex_unlock(&q->qi_dqlist_lock);
+ return 0;
+ }
+ ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+ list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
+ /*
+ * If this is vfs_sync calling, then skip the dquots that
+ * don't 'seem' to be dirty. ie. don't acquire dqlock.
+ * This is very similar to what xfs_sync does with inodes.
+ */
+ if (flags & SYNC_TRYLOCK) {
+ if (!XFS_DQ_IS_DIRTY(dqp))
+ continue;
+ if (!xfs_qm_dqlock_nowait(dqp))
+ continue;
+ } else {
+ xfs_dqlock(dqp);
+ }
+
+ /*
+ * Now, find out for sure if this dquot is dirty or not.
+ */
+ if (! XFS_DQ_IS_DIRTY(dqp)) {
+ xfs_dqunlock(dqp);
+ continue;
+ }
+
+ /* XXX a sentinel would be better */
+ recl = q->qi_dqreclaims;
+ if (!xfs_dqflock_nowait(dqp)) {
+ if (flags & SYNC_TRYLOCK) {
+ xfs_dqunlock(dqp);
+ continue;
+ }
+ /*
+ * If we can't grab the flush lock then if the caller
+ * really wanted us to give this our best shot, so
+ * see if we can give a push to the buffer before we wait
+ * on the flush lock. At this point, we know that
+ * even though the dquot is being flushed,
+ * it has (new) dirty data.
+ */
+ xfs_qm_dqflock_pushbuf_wait(dqp);
+ }
+ /*
+ * Let go of the mplist lock. We don't want to hold it
+ * across a disk write
+ */
+ mutex_unlock(&q->qi_dqlist_lock);
+ error = xfs_qm_dqflush(dqp, flags);
+ xfs_dqunlock(dqp);
+ if (error && XFS_FORCED_SHUTDOWN(mp))
+ return 0; /* Need to prevent umount failure */
+ else if (error)
+ return error;
+
+ mutex_lock(&q->qi_dqlist_lock);
+ if (recl != q->qi_dqreclaims) {
+ if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
+ break;
+
+ mutex_unlock(&q->qi_dqlist_lock);
+ goto again;
+ }
+ }
+
+ mutex_unlock(&q->qi_dqlist_lock);
+ return 0;
+}
+
+/*
+ * The hash chains and the mplist use the same xfs_dqhash structure as
+ * their list head, but we can take the mplist qh_lock and one of the
+ * hash qh_locks at the same time without any problem as they aren't
+ * related.
+ */
+static struct lock_class_key xfs_quota_mplist_class;
+
+/*
+ * This initializes all the quota information that's kept in the
+ * mount structure
+ */
+STATIC int
+xfs_qm_init_quotainfo(
+ xfs_mount_t *mp)
+{
+ xfs_quotainfo_t *qinf;
+ int error;
+ xfs_dquot_t *dqp;
+
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ /*
+ * Tell XQM that we exist as soon as possible.
+ */
+ if ((error = xfs_qm_hold_quotafs_ref(mp))) {
+ return error;
+ }
+
+ qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+
+ /*
+ * See if quotainodes are setup, and if not, allocate them,
+ * and change the superblock accordingly.
+ */
+ if ((error = xfs_qm_init_quotainos(mp))) {
+ kmem_free(qinf);
+ mp->m_quotainfo = NULL;
+ return error;
+ }
+
+ INIT_LIST_HEAD(&qinf->qi_dqlist);
+ mutex_init(&qinf->qi_dqlist_lock);
+ lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
+
+ qinf->qi_dqreclaims = 0;
+
+ /* mutex used to serialize quotaoffs */
+ mutex_init(&qinf->qi_quotaofflock);
+
+ /* Precalc some constants */
+ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+ ASSERT(qinf->qi_dqchunklen);
+ qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
+ do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
+
+ mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
+
+ /*
+ * We try to get the limits from the superuser's limits fields.
+ * This is quite hacky, but it is standard quota practice.
+ * We look at the USR dquot with id == 0 first, but if user quotas
+ * are not enabled we goto the GRP dquot with id == 0.
+ * We don't really care to keep separate default limits for user
+ * and group quotas, at least not at this point.
+ */
+ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
+ XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
+ (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
+ XFS_DQ_PROJ),
+ XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
+ &dqp);
+ if (! error) {
+ xfs_disk_dquot_t *ddqp = &dqp->q_core;
+
+ /*
+ * The warnings and timers set the grace period given to
+ * a user or group before he or she can not perform any
+ * more writing. If it is zero, a default is used.
+ */
+ qinf->qi_btimelimit = ddqp->d_btimer ?
+ be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
+ qinf->qi_itimelimit = ddqp->d_itimer ?
+ be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
+ qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
+ be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
+ qinf->qi_bwarnlimit = ddqp->d_bwarns ?
+ be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
+ qinf->qi_iwarnlimit = ddqp->d_iwarns ?
+ be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
+ qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
+ be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
+ qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+
+ /*
+ * We sent the XFS_QMOPT_DQSUSER flag to dqget because
+ * we don't want this dquot cached. We haven't done a
+ * quotacheck yet, and quotacheck doesn't like incore dquots.
+ */
+ xfs_qm_dqdestroy(dqp);
+ } else {
+ qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+ qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+ qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+ qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+ qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+ qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+ }
+
+ return 0;
+}
+
+
+/*
+ * Gets called when unmounting a filesystem or when all quotas get
+ * turned off.
+ * This purges the quota inodes, destroys locks and frees itself.
+ */
+void
+xfs_qm_destroy_quotainfo(
+ xfs_mount_t *mp)
+{
+ xfs_quotainfo_t *qi;
+
+ qi = mp->m_quotainfo;
+ ASSERT(qi != NULL);
+ ASSERT(xfs_Gqm != NULL);
+
+ /*
+ * Release the reference that XQM kept, so that we know
+ * when the XQM structure should be freed. We cannot assume
+ * that xfs_Gqm is non-null after this point.
+ */
+ xfs_qm_rele_quotafs_ref(mp);
+
+ ASSERT(list_empty(&qi->qi_dqlist));
+ mutex_destroy(&qi->qi_dqlist_lock);
+
+ if (qi->qi_uquotaip) {
+ IRELE(qi->qi_uquotaip);
+ qi->qi_uquotaip = NULL; /* paranoia */
+ }
+ if (qi->qi_gquotaip) {
+ IRELE(qi->qi_gquotaip);
+ qi->qi_gquotaip = NULL;
+ }
+ mutex_destroy(&qi->qi_quotaofflock);
+ kmem_free(qi);
+ mp->m_quotainfo = NULL;
+}
+
+
+
+/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_list_init(
+ xfs_dqlist_t *list,
+ char *str,
+ int n)
+{
+ mutex_init(&list->qh_lock);
+ INIT_LIST_HEAD(&list->qh_list);
+ list->qh_version = 0;
+ list->qh_nelems = 0;
+}
+
+STATIC void
+xfs_qm_list_destroy(
+ xfs_dqlist_t *list)
+{
+ mutex_destroy(&(list->qh_lock));
+}
+
+/*
+ * Create an inode and return with a reference already taken, but unlocked
+ * This is how we create quota inodes
+ */
+STATIC int
+xfs_qm_qino_alloc(
+ xfs_mount_t *mp,
+ xfs_inode_t **ip,
+ __int64_t sbfields,
+ uint flags)
+{
+ xfs_trans_t *tp;
+ int error;
+ int committed;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
+ if ((error = xfs_trans_reserve(tp,
+ XFS_QM_QINOCREATE_SPACE_RES(mp),
+ XFS_CREATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_CREATE_LOG_COUNT))) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
+ if (error) {
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT);
+ return error;
+ }
+
+ /*
+ * Make the changes in the superblock, and log those too.
+ * sbfields arg may contain fields other than *QUOTINO;
+ * VERSIONNUM for example.
+ */
+ spin_lock(&mp->m_sb_lock);
+ if (flags & XFS_QMOPT_SBVERSION) {
+ ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
+ ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+ XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
+ (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+ XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+
+ xfs_sb_version_addquota(&mp->m_sb);
+ mp->m_sb.sb_uquotino = NULLFSINO;
+ mp->m_sb.sb_gquotino = NULLFSINO;
+
+ /* qflags will get updated _after_ quotacheck */
+ mp->m_sb.sb_qflags = 0;
+ }
+ if (flags & XFS_QMOPT_UQUOTA)
+ mp->m_sb.sb_uquotino = (*ip)->i_ino;
+ else
+ mp->m_sb.sb_gquotino = (*ip)->i_ino;
+ spin_unlock(&mp->m_sb_lock);
+ xfs_mod_sb(tp, sbfields);
+
+ if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
+ xfs_alert(mp, "%s failed (error %d)!", __func__, error);
+ return error;
+ }
+ return 0;
+}
+
+
+STATIC void
+xfs_qm_reset_dqcounts(
+ xfs_mount_t *mp,
+ xfs_buf_t *bp,
+ xfs_dqid_t id,
+ uint type)
+{
+ xfs_disk_dquot_t *ddq;
+ int j;
+
+ trace_xfs_reset_dqcounts(bp, _RET_IP_);
+
+ /*
+ * Reset all counters and timers. They'll be
+ * started afresh by xfs_qm_quotacheck.
+ */
+#ifdef DEBUG
+ j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+ do_div(j, sizeof(xfs_dqblk_t));
+ ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
+#endif
+ ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
+ for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+ /*
+ * Do a sanity check, and if needed, repair the dqblk. Don't
+ * output any warnings because it's perfectly possible to
+ * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
+ */
+ (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+ "xfs_quotacheck");
+ ddq->d_bcount = 0;
+ ddq->d_icount = 0;
+ ddq->d_rtbcount = 0;
+ ddq->d_btimer = 0;
+ ddq->d_itimer = 0;
+ ddq->d_rtbtimer = 0;
+ ddq->d_bwarns = 0;
+ ddq->d_iwarns = 0;
+ ddq->d_rtbwarns = 0;
+ ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+ }
+}
+
+STATIC int
+xfs_qm_dqiter_bufs(
+ xfs_mount_t *mp,
+ xfs_dqid_t firstid,
+ xfs_fsblock_t bno,
+ xfs_filblks_t blkcnt,
+ uint flags)
+{
+ xfs_buf_t *bp;
+ int error;
+ int type;
+
+ ASSERT(blkcnt > 0);
+ type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
+ (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
+ error = 0;
+
+ /*
+ * Blkcnt arg can be a very big number, and might even be
+ * larger than the log itself. So, we have to break it up into
+ * manageable-sized transactions.
+ * Note that we don't start a permanent transaction here; we might
+ * not be able to get a log reservation for the whole thing up front,
+ * and we don't really care to either, because we just discard
+ * everything if we were to crash in the middle of this loop.
+ */
+ while (blkcnt--) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, bno),
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ break;
+
+ xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+ xfs_bdwrite(mp, bp);
+ /*
+ * goto the next block.
+ */
+ bno++;
+ firstid += mp->m_quotainfo->qi_dqperchunk;
+ }
+ return error;
+}
+
+/*
+ * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
+ * caller supplied function for every chunk of dquots that we find.
+ */
+STATIC int
+xfs_qm_dqiterate(
+ xfs_mount_t *mp,
+ xfs_inode_t *qip,
+ uint flags)
+{
+ xfs_bmbt_irec_t *map;
+ int i, nmaps; /* number of map entries */
+ int error; /* return value */
+ xfs_fileoff_t lblkno;
+ xfs_filblks_t maxlblkcnt;
+ xfs_dqid_t firstid;
+ xfs_fsblock_t rablkno;
+ xfs_filblks_t rablkcnt;
+
+ error = 0;
+ /*
+ * This looks racy, but we can't keep an inode lock across a
+ * trans_reserve. But, this gets called during quotacheck, and that
+ * happens only at mount time which is single threaded.
+ */
+ if (qip->i_d.di_nblocks == 0)
+ return 0;
+
+ map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+
+ lblkno = 0;
+ maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+ do {
+ nmaps = XFS_DQITER_MAP_SIZE;
+ /*
+ * We aren't changing the inode itself. Just changing
+ * some of its data. No new blocks are added here, and
+ * the inode is never added to the transaction.
+ */
+ xfs_ilock(qip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi(NULL, qip, lblkno,
+ maxlblkcnt - lblkno,
+ XFS_BMAPI_METADATA,
+ NULL,
+ 0, map, &nmaps, NULL);
+ xfs_iunlock(qip, XFS_ILOCK_SHARED);
+ if (error)
+ break;
+
+ ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
+ for (i = 0; i < nmaps; i++) {
+ ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+ ASSERT(map[i].br_blockcount);
+
+
+ lblkno += map[i].br_blockcount;
+
+ if (map[i].br_startblock == HOLESTARTBLOCK)
+ continue;
+
+ firstid = (xfs_dqid_t) map[i].br_startoff *
+ mp->m_quotainfo->qi_dqperchunk;
+ /*
+ * Do a read-ahead on the next extent.
+ */
+ if ((i+1 < nmaps) &&
+ (map[i+1].br_startblock != HOLESTARTBLOCK)) {
+ rablkcnt = map[i+1].br_blockcount;
+ rablkno = map[i+1].br_startblock;
+ while (rablkcnt--) {
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, rablkno),
+ mp->m_quotainfo->qi_dqchunklen);
+ rablkno++;
+ }
+ }
+ /*
+ * Iterate thru all the blks in the extent and
+ * reset the counters of all the dquots inside them.
+ */
+ if ((error = xfs_qm_dqiter_bufs(mp,
+ firstid,
+ map[i].br_startblock,
+ map[i].br_blockcount,
+ flags))) {
+ break;
+ }
+ }
+
+ if (error)
+ break;
+ } while (nmaps > 0);
+
+ kmem_free(map);
+
+ return error;
+}
+
+/*
+ * Called by dqusage_adjust in doing a quotacheck.
+ *
+ * Given the inode, and a dquot id this updates both the incore dqout as well
+ * as the buffer copy. This is so that once the quotacheck is done, we can
+ * just log all the buffers, as opposed to logging numerous updates to
+ * individual dquots.
+ */
+STATIC int
+xfs_qm_quotacheck_dqadjust(
+ struct xfs_inode *ip,
+ xfs_dqid_t id,
+ uint type,
+ xfs_qcnt_t nblks,
+ xfs_qcnt_t rtblks)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *dqp;
+ int error;
+
+ error = xfs_qm_dqget(mp, ip, id, type,
+ XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
+ if (error) {
+ /*
+ * Shouldn't be able to turn off quotas here.
+ */
+ ASSERT(error != ESRCH);
+ ASSERT(error != ENOENT);
+ return error;
+ }
+
+ trace_xfs_dqadjust(dqp);
+
+ /*
+ * Adjust the inode count and the block count to reflect this inode's
+ * resource usage.
+ */
+ be64_add_cpu(&dqp->q_core.d_icount, 1);
+ dqp->q_res_icount++;
+ if (nblks) {
+ be64_add_cpu(&dqp->q_core.d_bcount, nblks);
+ dqp->q_res_bcount += nblks;
+ }
+ if (rtblks) {
+ be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
+ dqp->q_res_rtbcount += rtblks;
+ }
+
+ /*
+ * Set default limits, adjust timers (since we changed usages)
+ *
+ * There are no timers for the default values set in the root dquot.
+ */
+ if (dqp->q_core.d_id) {
+ xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
+ xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
+ }
+
+ dqp->dq_flags |= XFS_DQ_DIRTY;
+ xfs_qm_dqput(dqp);
+ return 0;
+}
+
+STATIC int
+xfs_qm_get_rtblks(
+ xfs_inode_t *ip,
+ xfs_qcnt_t *O_rtblks)
+{
+ xfs_filblks_t rtblks; /* total rt blks */
+ xfs_extnum_t idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t nextents; /* number of extent entries */
+ int error;
+
+ ASSERT(XFS_IS_REALTIME_INODE(ip));
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
+ return error;
+ }
+ rtblks = 0;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (idx = 0; idx < nextents; idx++)
+ rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
+ *O_rtblks = (xfs_qcnt_t)rtblks;
+ return 0;
+}
+
+/*
+ * callback routine supplied to bulkstat(). Given an inumber, find its
+ * dquots and update them to account for resources taken by that inode.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqusage_adjust(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* not used */
+ int ubsize, /* not used */
+ int *ubused, /* not used */
+ int *res) /* result code value */
+{
+ xfs_inode_t *ip;
+ xfs_qcnt_t nblks, rtblks = 0;
+ int error;
+
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ /*
+ * rootino must have its resources accounted for, not so with the quota
+ * inodes.
+ */
+ if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+ *res = BULKSTAT_RV_NOTHING;
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
+ * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
+ * interface expects the inode to be exclusively locked because that's
+ * the case in all other instances. It's OK that we do this because
+ * quotacheck is done only at mount time.
+ */
+ error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+ if (error) {
+ *res = BULKSTAT_RV_NOTHING;
+ return error;
+ }
+
+ ASSERT(ip->i_delayed_blks == 0);
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Walk thru the extent list and count the realtime blocks.
+ */
+ error = xfs_qm_get_rtblks(ip, &rtblks);
+ if (error)
+ goto error0;
+ }
+
+ nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+
+ /*
+ * Add the (disk blocks and inode) resources occupied by this
+ * inode to its dquots. We do this adjustment in the incore dquot,
+ * and also copy the changes to its buffer.
+ * We don't care about putting these changes in a transaction
+ * envelope because if we crash in the middle of a 'quotacheck'
+ * we have to start from the beginning anyway.
+ * Once we're done, we'll log all the dquot bufs.
+ *
+ * The *QUOTA_ON checks below may look pretty racy, but quotachecks
+ * and quotaoffs don't race. (Quotachecks happen at mount time only).
+ */
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
+ XFS_DQ_USER, nblks, rtblks);
+ if (error)
+ goto error0;
+ }
+
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
+ XFS_DQ_GROUP, nblks, rtblks);
+ if (error)
+ goto error0;
+ }
+
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
+ XFS_DQ_PROJ, nblks, rtblks);
+ if (error)
+ goto error0;
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ IRELE(ip);
+ *res = BULKSTAT_RV_DIDONE;
+ return 0;
+
+error0:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ IRELE(ip);
+ *res = BULKSTAT_RV_GIVEUP;
+ return error;
+}
+
+/*
+ * Walk thru all the filesystem inodes and construct a consistent view
+ * of the disk quota world. If the quotacheck fails, disable quotas.
+ */
+int
+xfs_qm_quotacheck(
+ xfs_mount_t *mp)
+{
+ int done, count, error;
+ xfs_ino_t lastino;
+ size_t structsz;
+ xfs_inode_t *uip, *gip;
+ uint flags;
+
+ count = INT_MAX;
+ structsz = 1;
+ lastino = 0;
+ flags = 0;
+
+ ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ /*
+ * There should be no cached dquots. The (simplistic) quotacheck
+ * algorithm doesn't like that.
+ */
+ ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
+
+ xfs_notice(mp, "Quotacheck needed: Please wait.");
+
+ /*
+ * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
+ * their counters to zero. We need a clean slate.
+ * We don't log our changes till later.
+ */
+ uip = mp->m_quotainfo->qi_uquotaip;
+ if (uip) {
+ error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+ if (error)
+ goto error_return;
+ flags |= XFS_UQUOTA_CHKD;
+ }
+
+ gip = mp->m_quotainfo->qi_gquotaip;
+ if (gip) {
+ error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
+ XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ if (error)
+ goto error_return;
+ flags |= XFS_OQUOTA_CHKD;
+ }
+
+ do {
+ /*
+ * Iterate thru all the inodes in the file system,
+ * adjusting the corresponding dquot counters in core.
+ */
+ error = xfs_bulkstat(mp, &lastino, &count,
+ xfs_qm_dqusage_adjust,
+ structsz, NULL, &done);
+ if (error)
+ break;
+
+ } while (!done);
+
+ /*
+ * We've made all the changes that we need to make incore.
+ * Flush them down to disk buffers if everything was updated
+ * successfully.
+ */
+ if (!error)
+ error = xfs_qm_dqflush_all(mp, 0);
+
+ /*
+ * We can get this error if we couldn't do a dquot allocation inside
+ * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
+ * dirty dquots that might be cached, we just want to get rid of them
+ * and turn quotaoff. The dquots won't be attached to any of the inodes
+ * at this point (because we intentionally didn't in dqget_noattach).
+ */
+ if (error) {
+ xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+ goto error_return;
+ }
+
+ /*
+ * We didn't log anything, because if we crashed, we'll have to
+ * start the quotacheck from scratch anyway. However, we must make
+ * sure that our dquot changes are secure before we put the
+ * quotacheck'd stamp on the superblock. So, here we do a synchronous
+ * flush.
+ */
+ XFS_bflush(mp->m_ddev_targp);
+
+ /*
+ * If one type of quotas is off, then it will lose its
+ * quotachecked status, since we won't be doing accounting for
+ * that type anymore.
+ */
+ mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
+ mp->m_qflags |= flags;
+
+ xfs_qm_dquot_list_print(mp);
+
+ error_return:
+ if (error) {
+ xfs_warn(mp,
+ "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
+ error);
+ /*
+ * We must turn off quotas.
+ */
+ ASSERT(mp->m_quotainfo != NULL);
+ ASSERT(xfs_Gqm != NULL);
+ xfs_qm_destroy_quotainfo(mp);
+ if (xfs_mount_reset_sbqflags(mp)) {
+ xfs_warn(mp,
+ "Quotacheck: Failed to reset quota flags.");
+ }
+ } else
+ xfs_notice(mp, "Quotacheck: Done.");
+ return (error);
+}
+
+/*
+ * This is called after the superblock has been read in and we're ready to
+ * iget the quota inodes.
+ */
+STATIC int
+xfs_qm_init_quotainos(
+ xfs_mount_t *mp)
+{
+ xfs_inode_t *uip, *gip;
+ int error;
+ __int64_t sbflags;
+ uint flags;
+
+ ASSERT(mp->m_quotainfo);
+ uip = gip = NULL;
+ sbflags = 0;
+ flags = 0;
+
+ /*
+ * Get the uquota and gquota inodes
+ */
+ if (xfs_sb_version_hasquota(&mp->m_sb)) {
+ if (XFS_IS_UQUOTA_ON(mp) &&
+ mp->m_sb.sb_uquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_uquotino > 0);
+ if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip)))
+ return XFS_ERROR(error);
+ }
+ if (XFS_IS_OQUOTA_ON(mp) &&
+ mp->m_sb.sb_gquotino != NULLFSINO) {
+ ASSERT(mp->m_sb.sb_gquotino > 0);
+ if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip))) {
+ if (uip)
+ IRELE(uip);
+ return XFS_ERROR(error);
+ }
+ }
+ } else {
+ flags |= XFS_QMOPT_SBVERSION;
+ sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+ XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+ }
+
+ /*
+ * Create the two inodes, if they don't exist already. The changes
+ * made above will get added to a transaction and logged in one of
+ * the qino_alloc calls below. If the device is readonly,
+ * temporarily switch to read-write to do this.
+ */
+ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
+ if ((error = xfs_qm_qino_alloc(mp, &uip,
+ sbflags | XFS_SB_UQUOTINO,
+ flags | XFS_QMOPT_UQUOTA)))
+ return XFS_ERROR(error);
+
+ flags &= ~XFS_QMOPT_SBVERSION;
+ }
+ if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
+ flags |= (XFS_IS_GQUOTA_ON(mp) ?
+ XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+ error = xfs_qm_qino_alloc(mp, &gip,
+ sbflags | XFS_SB_GQUOTINO, flags);
+ if (error) {
+ if (uip)
+ IRELE(uip);
+
+ return XFS_ERROR(error);
+ }
+ }
+
+ mp->m_quotainfo->qi_uquotaip = uip;
+ mp->m_quotainfo->qi_gquotaip = gip;
+
+ return 0;
+}
+
+
+
+/*
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
+ */
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
+{
+ xfs_dquot_t *dqpout;
+ xfs_dquot_t *dqp;
+ int restarts;
+ int startagain;
+
+ restarts = 0;
+ dqpout = NULL;
+
+ /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+again:
+ startagain = 0;
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+
+ list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+ struct xfs_mount *mp = dqp->q_mount;
+ xfs_dqlock(dqp);
+
+ /*
+ * We are racing with dqlookup here. Naturally we don't
+ * want to reclaim a dquot that lookup wants. We release the
+ * freelist lock and start over, so that lookup will grab
+ * both the dquot and the freelistlock.
+ */
+ if (dqp->dq_flags & XFS_DQ_WANT) {
+ ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+
+ trace_xfs_dqreclaim_want(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dqwants);
+ restarts++;
+ startagain = 1;
+ goto dqunlock;
+ }
+
+ /*
+ * If the dquot is inactive, we are assured that it is
+ * not on the mplist or the hashlist, and that makes our
+ * life easier.
+ */
+ if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+ ASSERT(mp == NULL);
+ ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+ ASSERT(list_empty(&dqp->q_hashlist));
+ ASSERT(list_empty(&dqp->q_mplist));
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ dqpout = dqp;
+ XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
+ goto dqunlock;
+ }
+
+ ASSERT(dqp->q_hash);
+ ASSERT(!list_empty(&dqp->q_mplist));
+
+ /*
+ * Try to grab the flush lock. If this dquot is in the process
+ * of getting flushed to disk, we don't want to reclaim it.
+ */
+ if (!xfs_dqflock_nowait(dqp))
+ goto dqunlock;
+
+ /*
+ * We have the flush lock so we know that this is not in the
+ * process of being flushed. So, if this is dirty, flush it
+ * DELWRI so that we don't get a freelist infested with
+ * dirty dquots.
+ */
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ int error;
+
+ trace_xfs_dqreclaim_dirty(dqp);
+
+ /*
+ * We flush it delayed write, so don't bother
+ * releasing the freelist lock.
+ */
+ error = xfs_qm_dqflush(dqp, 0);
+ if (error) {
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
+ }
+ goto dqunlock;
+ }
+
+ /*
+ * We're trying to get the hashlock out of order. This races
+ * with dqlookup; so, we giveup and goto the next dquot if
+ * we couldn't get the hashlock. This way, we won't starve
+ * a dqlookup process that holds the hashlock that is
+ * waiting for the freelist lock.
+ */
+ if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
+ restarts++;
+ goto dqfunlock;
+ }
+
+ /*
+ * This races with dquot allocation code as well as dqflush_all
+ * and reclaim code. So, if we failed to grab the mplist lock,
+ * giveup everything and start over.
+ */
+ if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
+ restarts++;
+ startagain = 1;
+ goto qhunlock;
+ }
+
+ ASSERT(dqp->q_nrefs == 0);
+ list_del_init(&dqp->q_mplist);
+ mp->m_quotainfo->qi_dquots--;
+ mp->m_quotainfo->qi_dqreclaims++;
+ list_del_init(&dqp->q_hashlist);
+ dqp->q_hash->qh_version++;
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ dqpout = dqp;
+ mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+qhunlock:
+ mutex_unlock(&dqp->q_hash->qh_lock);
+dqfunlock:
+ xfs_dqfunlock(dqp);
+dqunlock:
+ xfs_dqunlock(dqp);
+ if (dqpout)
+ break;
+ if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+ break;
+ if (startagain) {
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ goto again;
+ }
+ }
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+ return dqpout;
+}
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ */
+STATIC int
+xfs_qm_shake_freelist(
+ int howmany)
+{
+ int nreclaimed = 0;
+ xfs_dquot_t *dqp;
+
+ if (howmany <= 0)
+ return 0;
+
+ while (nreclaimed < howmany) {
+ dqp = xfs_qm_dqreclaim_one();
+ if (!dqp)
+ return nreclaimed;
+ xfs_qm_dqdestroy(dqp);
+ nreclaimed++;
+ }
+ return nreclaimed;
+}
+
+/*
+ * The kmem_shake interface is invoked when memory is running low.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_shake(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ int ndqused, nfree, n;
+ gfp_t gfp_mask = sc->gfp_mask;
+
+ if (!kmem_shake_allow(gfp_mask))
+ return 0;
+ if (!xfs_Gqm)
+ return 0;
+
+ nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
+ /* incore dquots in all f/s's */
+ ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
+
+ ASSERT(ndqused >= 0);
+
+ if (nfree <= ndqused && nfree < ndquot)
+ return 0;
+
+ ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
+ n = nfree - ndqused - ndquot; /* # over target */
+
+ return xfs_qm_shake_freelist(MAX(nfree, n));
+}
+
+
+/*------------------------------------------------------------------*/
+
+/*
+ * Return a new incore dquot. Depending on the number of
+ * dquots in the system, we either allocate a new one on the kernel heap,
+ * or reclaim a free one.
+ * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
+ * to reclaim an existing one from the freelist.
+ */
+boolean_t
+xfs_qm_dqalloc_incore(
+ xfs_dquot_t **O_dqpp)
+{
+ xfs_dquot_t *dqp;
+
+ /*
+ * Check against high water mark to see if we want to pop
+ * a nincompoop dquot off the freelist.
+ */
+ if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
+ /*
+ * Try to recycle a dquot from the freelist.
+ */
+ if ((dqp = xfs_qm_dqreclaim_one())) {
+ XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
+ /*
+ * Just zero the core here. The rest will get
+ * reinitialized by caller. XXX we shouldn't even
+ * do this zero ...
+ */
+ memset(&dqp->q_core, 0, sizeof(dqp->q_core));
+ *O_dqpp = dqp;
+ return B_FALSE;
+ }
+ XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+ }
+
+ /*
+ * Allocate a brand new dquot on the kernel heap and return it
+ * to the caller to initialize.
+ */
+ ASSERT(xfs_Gqm->qm_dqzone != NULL);
+ *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+ atomic_inc(&xfs_Gqm->qm_totaldquots);
+
+ return B_TRUE;
+}
+
+
+/*
+ * Start a transaction and write the incore superblock changes to
+ * disk. flags parameter indicates which fields have changed.
+ */
+int
+xfs_qm_write_sb_changes(
+ xfs_mount_t *mp,
+ __int64_t flags)
+{
+ xfs_trans_t *tp;
+ int error;
+
+#ifdef QUOTADEBUG
+ xfs_notice(mp, "Writing superblock quota changes");
+#endif
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+ if ((error = xfs_trans_reserve(tp, 0,
+ mp->m_sb.sb_sectsize + 128, 0,
+ 0,
+ XFS_DEFAULT_LOG_COUNT))) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_mod_sb(tp, flags);
+ error = xfs_trans_commit(tp, 0);
+
+ return error;
+}
+
+
+/* --------------- utility functions for vnodeops ---------------- */
+
+
+/*
+ * Given an inode, a uid, gid and prid make sure that we have
+ * allocated relevant dquot(s) on disk, and that we won't exceed inode
+ * quotas by creating this file.
+ * This also attaches dquot(s) to the given inode after locking it,
+ * and returns the dquots corresponding to the uid and/or gid.
+ *
+ * in : inode (unlocked)
+ * out : udquot, gdquot with references taken and unlocked
+ */
+int
+xfs_qm_vop_dqalloc(
+ struct xfs_inode *ip,
+ uid_t uid,
+ gid_t gid,
+ prid_t prid,
+ uint flags,
+ struct xfs_dquot **O_udqpp,
+ struct xfs_dquot **O_gdqpp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *uq, *gq;
+ int error;
+ uint lockflags;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ lockflags = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockflags);
+
+ if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
+ gid = ip->i_d.di_gid;
+
+ /*
+ * Attach the dquot(s) to this inode, doing a dquot allocation
+ * if necessary. The dquot(s) will not be locked.
+ */
+ if (XFS_NOT_DQATTACHED(mp, ip)) {
+ error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+ if (error) {
+ xfs_iunlock(ip, lockflags);
+ return error;
+ }
+ }
+
+ uq = gq = NULL;
+ if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
+ if (ip->i_d.di_uid != uid) {
+ /*
+ * What we need is the dquot that has this uid, and
+ * if we send the inode to dqget, the uid of the inode
+ * takes priority over what's sent in the uid argument.
+ * We must unlock inode here before calling dqget if
+ * we're not sending the inode, because otherwise
+ * we'll deadlock by doing trans_reserve while
+ * holding ilock.
+ */
+ xfs_iunlock(ip, lockflags);
+ if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+ XFS_DQ_USER,
+ XFS_QMOPT_DQALLOC |
+ XFS_QMOPT_DOWARN,
+ &uq))) {
+ ASSERT(error != ENOENT);
+ return error;
+ }
+ /*
+ * Get the ilock in the right order.
+ */
+ xfs_dqunlock(uq);
+ lockflags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ } else {
+ /*
+ * Take an extra reference, because we'll return
+ * this to caller
+ */
+ ASSERT(ip->i_udquot);
+ uq = ip->i_udquot;
+ xfs_dqlock(uq);
+ XFS_DQHOLD(uq);
+ xfs_dqunlock(uq);
+ }
+ }
+ if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
+ if (ip->i_d.di_gid != gid) {
+ xfs_iunlock(ip, lockflags);
+ if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+ XFS_DQ_GROUP,
+ XFS_QMOPT_DQALLOC |
+ XFS_QMOPT_DOWARN,
+ &gq))) {
+ if (uq)
+ xfs_qm_dqrele(uq);
+ ASSERT(error != ENOENT);
+ return error;
+ }
+ xfs_dqunlock(gq);
+ lockflags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ } else {
+ ASSERT(ip->i_gdquot);
+ gq = ip->i_gdquot;
+ xfs_dqlock(gq);
+ XFS_DQHOLD(gq);
+ xfs_dqunlock(gq);
+ }
+ } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+ if (xfs_get_projid(ip) != prid) {
+ xfs_iunlock(ip, lockflags);
+ if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+ XFS_DQ_PROJ,
+ XFS_QMOPT_DQALLOC |
+ XFS_QMOPT_DOWARN,
+ &gq))) {
+ if (uq)
+ xfs_qm_dqrele(uq);
+ ASSERT(error != ENOENT);
+ return (error);
+ }
+ xfs_dqunlock(gq);
+ lockflags = XFS_ILOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ } else {
+ ASSERT(ip->i_gdquot);
+ gq = ip->i_gdquot;
+ xfs_dqlock(gq);
+ XFS_DQHOLD(gq);
+ xfs_dqunlock(gq);
+ }
+ }
+ if (uq)
+ trace_xfs_dquot_dqalloc(ip);
+
+ xfs_iunlock(ip, lockflags);
+ if (O_udqpp)
+ *O_udqpp = uq;
+ else if (uq)
+ xfs_qm_dqrele(uq);
+ if (O_gdqpp)
+ *O_gdqpp = gq;
+ else if (gq)
+ xfs_qm_dqrele(gq);
+ return 0;
+}
+
+/*
+ * Actually transfer ownership, and do dquot modifications.
+ * These were already reserved.
+ */
+xfs_dquot_t *
+xfs_qm_vop_chown(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ xfs_dquot_t **IO_olddq,
+ xfs_dquot_t *newdq)
+{
+ xfs_dquot_t *prevdq;
+ uint bfield = XFS_IS_REALTIME_INODE(ip) ?
+ XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
+
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+
+ /* old dquot */
+ prevdq = *IO_olddq;
+ ASSERT(prevdq);
+ ASSERT(prevdq != newdq);
+
+ xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
+ xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+
+ /* the sparkling new dquot */
+ xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+
+ /*
+ * Take an extra reference, because the inode
+ * is going to keep this dquot pointer even
+ * after the trans_commit.
+ */
+ xfs_dqlock(newdq);
+ XFS_DQHOLD(newdq);
+ xfs_dqunlock(newdq);
+ *IO_olddq = newdq;
+
+ return prevdq;
+}
+
+/*
+ * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
+ */
+int
+xfs_qm_vop_chown_reserve(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ xfs_dquot_t *udqp,
+ xfs_dquot_t *gdqp,
+ uint flags)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ uint delblks, blkflags, prjflags = 0;
+ xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
+ int error;
+
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ delblks = ip->i_delayed_blks;
+ delblksudq = delblksgdq = unresudq = unresgdq = NULL;
+ blkflags = XFS_IS_REALTIME_INODE(ip) ?
+ XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
+
+ if (XFS_IS_UQUOTA_ON(mp) && udqp &&
+ ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
+ delblksudq = udqp;
+ /*
+ * If there are delayed allocation blocks, then we have to
+ * unreserve those from the old dquot, and add them to the
+ * new dquot.
+ */
+ if (delblks) {
+ ASSERT(ip->i_udquot);
+ unresudq = ip->i_udquot;
+ }
+ }
+ if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
+ if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
+ xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
+ prjflags = XFS_QMOPT_ENOSPC;
+
+ if (prjflags ||
+ (XFS_IS_GQUOTA_ON(ip->i_mount) &&
+ ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
+ delblksgdq = gdqp;
+ if (delblks) {
+ ASSERT(ip->i_gdquot);
+ unresgdq = ip->i_gdquot;
+ }
+ }
+ }
+
+ if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+ delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
+ flags | blkflags | prjflags)))
+ return (error);
+
+ /*
+ * Do the delayed blks reservations/unreservations now. Since, these
+ * are done without the help of a transaction, if a reservation fails
+ * its previous reservations won't be automatically undone by trans
+ * code. So, we have to do it manually here.
+ */
+ if (delblks) {
+ /*
+ * Do the reservations first. Unreservation can't fail.
+ */
+ ASSERT(delblksudq || delblksgdq);
+ ASSERT(unresudq || unresgdq);
+ if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+ delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
+ flags | blkflags | prjflags)))
+ return (error);
+ xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+ unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
+ blkflags);
+ }
+
+ return (0);
+}
+
+int
+xfs_qm_vop_rename_dqattach(
+ struct xfs_inode **i_tab)
+{
+ struct xfs_mount *mp = i_tab[0]->i_mount;
+ int i;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ for (i = 0; (i < 4 && i_tab[i]); i++) {
+ struct xfs_inode *ip = i_tab[i];
+ int error;
+
+ /*
+ * Watch out for duplicate entries in the table.
+ */
+ if (i == 0 || ip != i_tab[i-1]) {
+ if (XFS_NOT_DQATTACHED(mp, ip)) {
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+ }
+ }
+ }
+ return 0;
+}
+
+void
+xfs_qm_vop_create_dqattach(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ if (udqp) {
+ xfs_dqlock(udqp);
+ XFS_DQHOLD(udqp);
+ xfs_dqunlock(udqp);
+ ASSERT(ip->i_udquot == NULL);
+ ip->i_udquot = udqp;
+ ASSERT(XFS_IS_UQUOTA_ON(mp));
+ ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+ xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
+ if (gdqp) {
+ xfs_dqlock(gdqp);
+ XFS_DQHOLD(gdqp);
+ xfs_dqunlock(gdqp);
+ ASSERT(ip->i_gdquot == NULL);
+ ip->i_gdquot = gdqp;
+ ASSERT(XFS_IS_OQUOTA_ON(mp));
+ ASSERT((XFS_IS_GQUOTA_ON(mp) ?
+ ip->i_d.di_gid : xfs_get_projid(ip)) ==
+ be32_to_cpu(gdqp->q_core.d_id));
+ xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
+ }
+}
+
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
new file mode 100644
index 00000000..567b29b9
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_QM_H__
+#define __XFS_QM_H__
+
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_quota_priv.h"
+#include "xfs_qm_stats.h"
+
+struct xfs_qm;
+struct xfs_inode;
+
+extern uint ndquot;
+extern struct mutex xfs_Gqm_lock;
+extern struct xfs_qm *xfs_Gqm;
+extern kmem_zone_t *qm_dqzone;
+extern kmem_zone_t *qm_dqtrxzone;
+
+/*
+ * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
+ * iterate over the mountpt's dquot list in one call.
+ */
+#define XFS_QM_SYNC_MAX_RESTARTS 7
+
+/*
+ * Ditto, for xfs_qm_dqreclaim_one.
+ */
+#define XFS_QM_RECLAIM_MAX_RESTARTS 4
+
+/*
+ * Ideal ratio of free to in use dquots. Quota manager makes an attempt
+ * to keep this balance.
+ */
+#define XFS_QM_DQFREE_RATIO 2
+
+/*
+ * Dquot hashtable constants/threshold values.
+ */
+#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t))
+#define XFS_QM_HASHSIZE_HIGH ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
+
+/*
+ * This defines the unit of allocation of dquots.
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ * XXXsup However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+
+typedef xfs_dqhash_t xfs_dqlist_t;
+
+/*
+ * Quota Manager (global) structure. Lives only in core.
+ */
+typedef struct xfs_qm {
+ xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
+ xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
+ uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
+ struct list_head qm_dqfrlist; /* freelist of dquots */
+ struct mutex qm_dqfrlist_lock;
+ int qm_dqfrlist_cnt;
+ atomic_t qm_totaldquots; /* total incore dquots */
+ uint qm_nrefs; /* file systems with quota on */
+ int qm_dqfree_ratio;/* ratio of free to inuse dquots */
+ kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
+ kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
+} xfs_qm_t;
+
+/*
+ * Various quota information for individual filesystems.
+ * The mount structure keeps a pointer to this.
+ */
+typedef struct xfs_quotainfo {
+ xfs_inode_t *qi_uquotaip; /* user quota inode */
+ xfs_inode_t *qi_gquotaip; /* group quota inode */
+ struct list_head qi_dqlist; /* all dquots in filesys */
+ struct mutex qi_dqlist_lock;
+ int qi_dquots;
+ int qi_dqreclaims; /* a change here indicates
+ a removal in the dqlist */
+ time_t qi_btimelimit; /* limit for blks timer */
+ time_t qi_itimelimit; /* limit for inodes timer */
+ time_t qi_rtbtimelimit;/* limit for rt blks timer */
+ xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
+ xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
+ xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
+ struct mutex qi_quotaofflock;/* to serialize quotaoff */
+ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
+ uint qi_dqperchunk; /* # ondisk dqs in above chunk */
+ xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
+ xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */
+ xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */
+ xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
+ xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
+ xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
+} xfs_quotainfo_t;
+
+
+extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
+extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
+ xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
+extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
+extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+
+/*
+ * We keep the usr and grp dquots separately so that locking will be easier
+ * to do at commit time. All transactions that we know of at this point
+ * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
+ */
+#define XFS_QM_TRANS_MAXDQS 2
+typedef struct xfs_dquot_acct {
+ xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
+ xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
+} xfs_dquot_acct_t;
+
+/*
+ * Users are allowed to have a usage exceeding their softlimit for
+ * a period this long.
+ */
+#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */
+#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
+#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
+
+#define XFS_QM_BWARNLIMIT 5
+#define XFS_QM_IWARNLIMIT 5
+#define XFS_QM_RTBWARNLIMIT 5
+
+extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
+extern int xfs_qm_quotacheck(xfs_mount_t *);
+extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+
+/* dquot stuff */
+extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
+extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
+extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+
+/* quota ops */
+extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+ fs_disk_quota_t *);
+extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+ fs_disk_quota_t *);
+extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+
+#ifdef DEBUG
+extern int xfs_qm_internalqcheck(xfs_mount_t *);
+#else
+#define xfs_qm_internalqcheck(mp) (0)
+#endif
+
+#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
new file mode 100644
index 00000000..a0a829ad
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_qm.h"
+
+
+STATIC void
+xfs_fill_statvfs_from_dquot(
+ struct kstatfs *statp,
+ xfs_disk_dquot_t *dp)
+{
+ __uint64_t limit;
+
+ limit = dp->d_blk_softlimit ?
+ be64_to_cpu(dp->d_blk_softlimit) :
+ be64_to_cpu(dp->d_blk_hardlimit);
+ if (limit && statp->f_blocks > limit) {
+ statp->f_blocks = limit;
+ statp->f_bfree = statp->f_bavail =
+ (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
+ (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
+ }
+
+ limit = dp->d_ino_softlimit ?
+ be64_to_cpu(dp->d_ino_softlimit) :
+ be64_to_cpu(dp->d_ino_hardlimit);
+ if (limit && statp->f_files > limit) {
+ statp->f_files = limit;
+ statp->f_ffree =
+ (statp->f_files > be64_to_cpu(dp->d_icount)) ?
+ (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0;
+ }
+}
+
+
+/*
+ * Directory tree accounting is implemented using project quotas, where
+ * the project identifier is inherited from parent directories.
+ * A statvfs (df, etc.) of a directory that is using project quota should
+ * return a statvfs of the project, not the entire filesystem.
+ * This makes such trees appear as if they are filesystems in themselves.
+ */
+void
+xfs_qm_statvfs(
+ xfs_inode_t *ip,
+ struct kstatfs *statp)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_dquot_t *dqp;
+
+ if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
+ xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
+ xfs_qm_dqput(dqp);
+ }
+}
+
+int
+xfs_qm_newmount(
+ xfs_mount_t *mp,
+ uint *needquotamount,
+ uint *quotaflags)
+{
+ uint quotaondisk;
+ uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
+
+ quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
+ (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
+
+ if (quotaondisk) {
+ uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
+ pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT;
+ gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
+ }
+
+ /*
+ * If the device itself is read-only, we can't allow
+ * the user to change the state of quota on the mount -
+ * this would generate a transaction on the ro device,
+ * which would lead to an I/O error and shutdown
+ */
+
+ if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
+ (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
+ (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
+ (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
+ (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
+ xfs_dev_is_read_only(mp, "changing quota state")) {
+ xfs_warn(mp, "please mount with%s%s%s%s.",
+ (!quotaondisk ? "out quota" : ""),
+ (uquotaondisk ? " usrquota" : ""),
+ (pquotaondisk ? " prjquota" : ""),
+ (gquotaondisk ? " grpquota" : ""));
+ return XFS_ERROR(EPERM);
+ }
+
+ if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
+ /*
+ * Call mount_quotas at this point only if we won't have to do
+ * a quotacheck.
+ */
+ if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
+ /*
+ * If an error occurred, qm_mount_quotas code
+ * has already disabled quotas. So, just finish
+ * mounting, and get on with the boring life
+ * without disk quotas.
+ */
+ xfs_qm_mount_quotas(mp);
+ } else {
+ /*
+ * Clear the quota flags, but remember them. This
+ * is so that the quota code doesn't get invoked
+ * before we're ready. This can happen when an
+ * inode goes inactive and wants to free blocks,
+ * or via xfs_log_mount_finish.
+ */
+ *needquotamount = B_TRUE;
+ *quotaflags = mp->m_qflags;
+ mp->m_qflags = 0;
+ }
+ }
+
+ return 0;
+}
+
+void __init
+xfs_qm_init(void)
+{
+ printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
+ mutex_init(&xfs_Gqm_lock);
+ xfs_qm_init_procfs();
+}
+
+void __exit
+xfs_qm_exit(void)
+{
+ xfs_qm_cleanup_procfs();
+ if (qm_dqzone)
+ kmem_zone_destroy(qm_dqzone);
+ if (qm_dqtrxzone)
+ kmem_zone_destroy(qm_dqtrxzone);
+}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
new file mode 100644
index 00000000..8671a0b3
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_qm.h"
+
+struct xqmstats xqmstats;
+
+static int xqm_proc_show(struct seq_file *m, void *v)
+{
+ /* maximum; incore; ratio free to inuse; freelist */
+ seq_printf(m, "%d\t%d\t%d\t%u\n",
+ ndquot,
+ xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
+ xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
+ xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
+ return 0;
+}
+
+static int xqm_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
+ /* quota performance statistics */
+ seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
+ xqmstats.xs_qm_dqreclaims,
+ xqmstats.xs_qm_dqreclaim_misses,
+ xqmstats.xs_qm_dquot_dups,
+ xqmstats.xs_qm_dqcachemisses,
+ xqmstats.xs_qm_dqcachehits,
+ xqmstats.xs_qm_dqwants,
+ xqmstats.xs_qm_dqshake_reclaims,
+ xqmstats.xs_qm_dqinact_reclaims);
+ return 0;
+}
+
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqmstat_proc_show, NULL);
+}
+
+static const struct file_operations xqmstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqmstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+void
+xfs_qm_init_procfs(void)
+{
+ proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+ proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
+}
+
+void
+xfs_qm_cleanup_procfs(void)
+{
+ remove_proc_entry("fs/xfs/xqm", NULL);
+ remove_proc_entry("fs/xfs/xqmstat", NULL);
+}
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
new file mode 100644
index 00000000..5b964fc0
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_QM_STATS_H__
+#define __XFS_QM_STATS_H__
+
+#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
+
+/*
+ * XQM global statistics
+ */
+struct xqmstats {
+ __uint32_t xs_qm_dqreclaims;
+ __uint32_t xs_qm_dqreclaim_misses;
+ __uint32_t xs_qm_dquot_dups;
+ __uint32_t xs_qm_dqcachemisses;
+ __uint32_t xs_qm_dqcachehits;
+ __uint32_t xs_qm_dqwants;
+ __uint32_t xs_qm_dqshake_reclaims;
+ __uint32_t xs_qm_dqinact_reclaims;
+};
+
+extern struct xqmstats xqmstats;
+
+# define XQM_STATS_INC(count) ( (count)++ )
+
+extern void xfs_qm_init_procfs(void);
+extern void xfs_qm_cleanup_procfs(void);
+
+#else
+
+# define XQM_STATS_INC(count) do { } while (0)
+
+static inline void xfs_qm_init_procfs(void) { };
+static inline void xfs_qm_cleanup_procfs(void) { };
+
+#endif
+
+#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
new file mode 100644
index 00000000..2dadb15d
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -0,0 +1,1259 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <linux/capability.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+
+STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
+STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
+ uint);
+STATIC uint xfs_qm_export_flags(uint);
+STATIC uint xfs_qm_export_qtype_flags(uint);
+STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
+ fs_disk_quota_t *);
+
+
+/*
+ * Turn off quota accounting and/or enforcement for all udquots and/or
+ * gdquots. Called only at unmount time.
+ *
+ * This assumes that there are no dquots of this file system cached
+ * incore, and modifies the ondisk dquot directly. Therefore, for example,
+ * it is an error to call this twice, without purging the cache.
+ */
+int
+xfs_qm_scall_quotaoff(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ uint dqtype;
+ int error;
+ uint inactivate_flags;
+ xfs_qoff_logitem_t *qoffstart;
+ int nculprits;
+
+ /*
+ * No file system can have quotas enabled on disk but not in core.
+ * Note that quota utilities (like quotaoff) _expect_
+ * errno == EEXIST here.
+ */
+ if ((mp->m_qflags & flags) == 0)
+ return XFS_ERROR(EEXIST);
+ error = 0;
+
+ flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+
+ /*
+ * We don't want to deal with two quotaoffs messing up each other,
+ * so we're going to serialize it. quotaoff isn't exactly a performance
+ * critical thing.
+ * If quotaoff, then we must be dealing with the root filesystem.
+ */
+ ASSERT(q);
+ mutex_lock(&q->qi_quotaofflock);
+
+ /*
+ * If we're just turning off quota enforcement, change mp and go.
+ */
+ if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
+ mp->m_qflags &= ~(flags);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = mp->m_qflags;
+ spin_unlock(&mp->m_sb_lock);
+ mutex_unlock(&q->qi_quotaofflock);
+
+ /* XXX what to do if error ? Revert back to old vals incore ? */
+ error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
+ return (error);
+ }
+
+ dqtype = 0;
+ inactivate_flags = 0;
+ /*
+ * If accounting is off, we must turn enforcement off, clear the
+ * quota 'CHKD' certificate to make it known that we have to
+ * do a quotacheck the next time this quota is turned on.
+ */
+ if (flags & XFS_UQUOTA_ACCT) {
+ dqtype |= XFS_QMOPT_UQUOTA;
+ flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
+ inactivate_flags |= XFS_UQUOTA_ACTIVE;
+ }
+ if (flags & XFS_GQUOTA_ACCT) {
+ dqtype |= XFS_QMOPT_GQUOTA;
+ flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ inactivate_flags |= XFS_GQUOTA_ACTIVE;
+ } else if (flags & XFS_PQUOTA_ACCT) {
+ dqtype |= XFS_QMOPT_PQUOTA;
+ flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+ inactivate_flags |= XFS_PQUOTA_ACTIVE;
+ }
+
+ /*
+ * Nothing to do? Don't complain. This happens when we're just
+ * turning off quota enforcement.
+ */
+ if ((mp->m_qflags & flags) == 0)
+ goto out_unlock;
+
+ /*
+ * Write the LI_QUOTAOFF log record, and do SB changes atomically,
+ * and synchronously. If we fail to write, we should abort the
+ * operation as it cannot be recovered safely if we crash.
+ */
+ error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
+ * to take care of the race between dqget and quotaoff. We don't take
+ * any special locks to reset these bits. All processes need to check
+ * these bits *after* taking inode lock(s) to see if the particular
+ * quota type is in the process of being turned off. If *ACTIVE, it is
+ * guaranteed that all dquot structures and all quotainode ptrs will all
+ * stay valid as long as that inode is kept locked.
+ *
+ * There is no turning back after this.
+ */
+ mp->m_qflags &= ~inactivate_flags;
+
+ /*
+ * Give back all the dquot reference(s) held by inodes.
+ * Here we go thru every single incore inode in this file system, and
+ * do a dqrele on the i_udquot/i_gdquot that it may have.
+ * Essentially, as long as somebody has an inode locked, this guarantees
+ * that quotas will not be turned off. This is handy because in a
+ * transaction once we lock the inode(s) and check for quotaon, we can
+ * depend on the quota inodes (and other things) being valid as long as
+ * we keep the lock(s).
+ */
+ xfs_qm_dqrele_all_inodes(mp, flags);
+
+ /*
+ * Next we make the changes in the quota flag in the mount struct.
+ * This isn't protected by a particular lock directly, because we
+ * don't want to take a mrlock every time we depend on quotas being on.
+ */
+ mp->m_qflags &= ~(flags);
+
+ /*
+ * Go through all the dquots of this file system and purge them,
+ * according to what was turned off. We may not be able to get rid
+ * of all dquots, because dquots can have temporary references that
+ * are not attached to inodes. eg. xfs_setattr, xfs_create.
+ * So, if we couldn't purge all the dquots from the filesystem,
+ * we can't get rid of the incore data structures.
+ */
+ while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
+ delay(10 * nculprits);
+
+ /*
+ * Transactions that had started before ACTIVE state bit was cleared
+ * could have logged many dquots, so they'd have higher LSNs than
+ * the first QUOTAOFF log record does. If we happen to crash when
+ * the tail of the log has gone past the QUOTAOFF record, but
+ * before the last dquot modification, those dquots __will__
+ * recover, and that's not good.
+ *
+ * So, we have QUOTAOFF start and end logitems; the start
+ * logitem won't get overwritten until the end logitem appears...
+ */
+ error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+ if (error) {
+ /* We're screwed now. Shutdown is the only option. */
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ goto out_unlock;
+ }
+
+ /*
+ * If quotas is completely disabled, close shop.
+ */
+ if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
+ ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
+ mutex_unlock(&q->qi_quotaofflock);
+ xfs_qm_destroy_quotainfo(mp);
+ return (0);
+ }
+
+ /*
+ * Release our quotainode references if we don't need them anymore.
+ */
+ if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
+ IRELE(q->qi_uquotaip);
+ q->qi_uquotaip = NULL;
+ }
+ if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
+ IRELE(q->qi_gquotaip);
+ q->qi_gquotaip = NULL;
+ }
+
+out_unlock:
+ mutex_unlock(&q->qi_quotaofflock);
+ return error;
+}
+
+STATIC int
+xfs_qm_scall_trunc_qfile(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+ struct xfs_trans *tp;
+ int error;
+
+ if (ino == NULLFSINO)
+ return 0;
+
+ error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+ error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ goto out_put;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+
+ error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, 1);
+ if (error) {
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT);
+ goto out_unlock;
+ }
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_put:
+ IRELE(ip);
+ return error;
+}
+
+int
+xfs_qm_scall_trunc_qfiles(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ int error = 0, error2 = 0;
+
+ if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
+ xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
+ __func__, flags, mp->m_qflags);
+ return XFS_ERROR(EINVAL);
+ }
+
+ if (flags & XFS_DQ_USER)
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
+ if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
+ error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+
+ return error ? error : error2;
+}
+
+/*
+ * Switch on (a given) quota enforcement for a filesystem. This takes
+ * effect immediately.
+ * (Switching on quota accounting must be done at mount time.)
+ */
+int
+xfs_qm_scall_quotaon(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ int error;
+ uint qf;
+ __int64_t sbflags;
+
+ flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+ /*
+ * Switching on quota accounting must be done at mount time.
+ */
+ flags &= ~(XFS_ALL_QUOTA_ACCT);
+
+ sbflags = 0;
+
+ if (flags == 0) {
+ xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
+ __func__, mp->m_qflags);
+ return XFS_ERROR(EINVAL);
+ }
+
+ /* No fs can turn on quotas with a delayed effect */
+ ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
+
+ /*
+ * Can't enforce without accounting. We check the superblock
+ * qflags here instead of m_qflags because rootfs can have
+ * quota acct on ondisk without m_qflags' knowing.
+ */
+ if (((flags & XFS_UQUOTA_ACCT) == 0 &&
+ (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+ (flags & XFS_UQUOTA_ENFD))
+ ||
+ ((flags & XFS_PQUOTA_ACCT) == 0 &&
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+ (flags & XFS_GQUOTA_ACCT) == 0 &&
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+ (flags & XFS_OQUOTA_ENFD))) {
+ xfs_debug(mp,
+ "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
+ __func__, flags, mp->m_sb.sb_qflags);
+ return XFS_ERROR(EINVAL);
+ }
+ /*
+ * If everything's up to-date incore, then don't waste time.
+ */
+ if ((mp->m_qflags & flags) == flags)
+ return XFS_ERROR(EEXIST);
+
+ /*
+ * Change sb_qflags on disk but not incore mp->qflags
+ * if this is the root filesystem.
+ */
+ spin_lock(&mp->m_sb_lock);
+ qf = mp->m_sb.sb_qflags;
+ mp->m_sb.sb_qflags = qf | flags;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * There's nothing to change if it's the same.
+ */
+ if ((qf & flags) == flags && sbflags == 0)
+ return XFS_ERROR(EEXIST);
+ sbflags |= XFS_SB_QFLAGS;
+
+ if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
+ return (error);
+ /*
+ * If we aren't trying to switch on quota enforcement, we are done.
+ */
+ if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
+ (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
+ ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
+ (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
+ ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
+ (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
+ (flags & XFS_ALL_QUOTA_ENFD) == 0)
+ return (0);
+
+ if (! XFS_IS_QUOTA_RUNNING(mp))
+ return XFS_ERROR(ESRCH);
+
+ /*
+ * Switch on quota enforcement in core.
+ */
+ mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+ mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
+ mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
+
+ return (0);
+}
+
+
+/*
+ * Return quota status information, such as uquota-off, enforcements, etc.
+ */
+int
+xfs_qm_scall_getqstat(
+ struct xfs_mount *mp,
+ struct fs_quota_stat *out)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_inode *uip, *gip;
+ boolean_t tempuqip, tempgqip;
+
+ uip = gip = NULL;
+ tempuqip = tempgqip = B_FALSE;
+ memset(out, 0, sizeof(fs_quota_stat_t));
+
+ out->qs_version = FS_QSTAT_VERSION;
+ if (!xfs_sb_version_hasquota(&mp->m_sb)) {
+ out->qs_uquota.qfs_ino = NULLFSINO;
+ out->qs_gquota.qfs_ino = NULLFSINO;
+ return (0);
+ }
+ out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
+ (XFS_ALL_QUOTA_ACCT|
+ XFS_ALL_QUOTA_ENFD));
+ out->qs_pad = 0;
+ out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
+ out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+
+ if (q) {
+ uip = q->qi_uquotaip;
+ gip = q->qi_gquotaip;
+ }
+ if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
+ if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+ 0, 0, &uip) == 0)
+ tempuqip = B_TRUE;
+ }
+ if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
+ if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+ 0, 0, &gip) == 0)
+ tempgqip = B_TRUE;
+ }
+ if (uip) {
+ out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
+ out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
+ if (tempuqip)
+ IRELE(uip);
+ }
+ if (gip) {
+ out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
+ out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
+ if (tempgqip)
+ IRELE(gip);
+ }
+ if (q) {
+ out->qs_incoredqs = q->qi_dquots;
+ out->qs_btimelimit = q->qi_btimelimit;
+ out->qs_itimelimit = q->qi_itimelimit;
+ out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+ out->qs_bwarnlimit = q->qi_bwarnlimit;
+ out->qs_iwarnlimit = q->qi_iwarnlimit;
+ }
+ return 0;
+}
+
+#define XFS_DQ_MASK \
+ (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
+
+/*
+ * Adjust quota limits, and start/stop timers accordingly.
+ */
+int
+xfs_qm_scall_setqlim(
+ xfs_mount_t *mp,
+ xfs_dqid_t id,
+ uint type,
+ fs_disk_quota_t *newlim)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ xfs_disk_dquot_t *ddq;
+ xfs_dquot_t *dqp;
+ xfs_trans_t *tp;
+ int error;
+ xfs_qcnt_t hard, soft;
+
+ if (newlim->d_fieldmask & ~XFS_DQ_MASK)
+ return EINVAL;
+ if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
+ return 0;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+ if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
+ 0, 0, XFS_DEFAULT_LOG_COUNT))) {
+ xfs_trans_cancel(tp, 0);
+ return (error);
+ }
+
+ /*
+ * We don't want to race with a quotaoff so take the quotaoff lock.
+ * (We don't hold an inode lock, so there's nothing else to stop
+ * a quotaoff from happening). (XXXThis doesn't currently happen
+ * because we take the vfslock before calling xfs_qm_sysent).
+ */
+ mutex_lock(&q->qi_quotaofflock);
+
+ /*
+ * Get the dquot (locked), and join it to the transaction.
+ * Allocate the dquot if this doesn't exist.
+ */
+ if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
+ xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ ASSERT(error != ENOENT);
+ goto out_unlock;
+ }
+ xfs_trans_dqjoin(tp, dqp);
+ ddq = &dqp->q_core;
+
+ /*
+ * Make sure that hardlimits are >= soft limits before changing.
+ */
+ hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
+ (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
+ be64_to_cpu(ddq->d_blk_hardlimit);
+ soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
+ (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
+ be64_to_cpu(ddq->d_blk_softlimit);
+ if (hard == 0 || hard >= soft) {
+ ddq->d_blk_hardlimit = cpu_to_be64(hard);
+ ddq->d_blk_softlimit = cpu_to_be64(soft);
+ if (id == 0) {
+ q->qi_bhardlimit = hard;
+ q->qi_bsoftlimit = soft;
+ }
+ } else {
+ xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
+ }
+ hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
+ (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
+ be64_to_cpu(ddq->d_rtb_hardlimit);
+ soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
+ (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
+ be64_to_cpu(ddq->d_rtb_softlimit);
+ if (hard == 0 || hard >= soft) {
+ ddq->d_rtb_hardlimit = cpu_to_be64(hard);
+ ddq->d_rtb_softlimit = cpu_to_be64(soft);
+ if (id == 0) {
+ q->qi_rtbhardlimit = hard;
+ q->qi_rtbsoftlimit = soft;
+ }
+ } else {
+ xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+ }
+
+ hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
+ (xfs_qcnt_t) newlim->d_ino_hardlimit :
+ be64_to_cpu(ddq->d_ino_hardlimit);
+ soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
+ (xfs_qcnt_t) newlim->d_ino_softlimit :
+ be64_to_cpu(ddq->d_ino_softlimit);
+ if (hard == 0 || hard >= soft) {
+ ddq->d_ino_hardlimit = cpu_to_be64(hard);
+ ddq->d_ino_softlimit = cpu_to_be64(soft);
+ if (id == 0) {
+ q->qi_ihardlimit = hard;
+ q->qi_isoftlimit = soft;
+ }
+ } else {
+ xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
+ }
+
+ /*
+ * Update warnings counter(s) if requested
+ */
+ if (newlim->d_fieldmask & FS_DQ_BWARNS)
+ ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns);
+ if (newlim->d_fieldmask & FS_DQ_IWARNS)
+ ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns);
+ if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
+ ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns);
+
+ if (id == 0) {
+ /*
+ * Timelimits for the super user set the relative time
+ * the other users can be over quota for this file system.
+ * If it is zero a default is used. Ditto for the default
+ * soft and hard limit values (already done, above), and
+ * for warnings.
+ */
+ if (newlim->d_fieldmask & FS_DQ_BTIMER) {
+ q->qi_btimelimit = newlim->d_btimer;
+ ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
+ }
+ if (newlim->d_fieldmask & FS_DQ_ITIMER) {
+ q->qi_itimelimit = newlim->d_itimer;
+ ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
+ }
+ if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
+ q->qi_rtbtimelimit = newlim->d_rtbtimer;
+ ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
+ }
+ if (newlim->d_fieldmask & FS_DQ_BWARNS)
+ q->qi_bwarnlimit = newlim->d_bwarns;
+ if (newlim->d_fieldmask & FS_DQ_IWARNS)
+ q->qi_iwarnlimit = newlim->d_iwarns;
+ if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
+ q->qi_rtbwarnlimit = newlim->d_rtbwarns;
+ } else {
+ /*
+ * If the user is now over quota, start the timelimit.
+ * The user will not be 'warned'.
+ * Note that we keep the timers ticking, whether enforcement
+ * is on or off. We don't really want to bother with iterating
+ * over all ondisk dquots and turning the timers on/off.
+ */
+ xfs_qm_adjust_dqtimers(mp, ddq);
+ }
+ dqp->dq_flags |= XFS_DQ_DIRTY;
+ xfs_trans_log_dquot(tp, dqp);
+
+ error = xfs_trans_commit(tp, 0);
+ xfs_qm_dqprint(dqp);
+ xfs_qm_dqrele(dqp);
+
+ out_unlock:
+ mutex_unlock(&q->qi_quotaofflock);
+ return error;
+}
+
+int
+xfs_qm_scall_getquota(
+ xfs_mount_t *mp,
+ xfs_dqid_t id,
+ uint type,
+ fs_disk_quota_t *out)
+{
+ xfs_dquot_t *dqp;
+ int error;
+
+ /*
+ * Try to get the dquot. We don't want it allocated on disk, so
+ * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+ * exist, we'll get ENOENT back.
+ */
+ if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
+ return (error);
+ }
+
+ /*
+ * If everything's NULL, this dquot doesn't quite exist as far as
+ * our utility programs are concerned.
+ */
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ xfs_qm_dqput(dqp);
+ return XFS_ERROR(ENOENT);
+ }
+ /* xfs_qm_dqprint(dqp); */
+ /*
+ * Convert the disk dquot to the exportable format
+ */
+ xfs_qm_export_dquot(mp, &dqp->q_core, out);
+ xfs_qm_dqput(dqp);
+ return (error ? XFS_ERROR(EFAULT) : 0);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+ xfs_mount_t *mp,
+ xfs_qoff_logitem_t *startqoff,
+ uint flags)
+{
+ xfs_trans_t *tp;
+ int error;
+ xfs_qoff_logitem_t *qoffi;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
+
+ if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
+ 0, 0, XFS_DEFAULT_LOG_COUNT))) {
+ xfs_trans_cancel(tp, 0);
+ return (error);
+ }
+
+ qoffi = xfs_trans_get_qoff_item(tp, startqoff,
+ flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
+ return (error);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff(
+ xfs_mount_t *mp,
+ xfs_qoff_logitem_t **qoffstartp,
+ uint flags)
+{
+ xfs_trans_t *tp;
+ int error;
+ xfs_qoff_logitem_t *qoffi=NULL;
+ uint oldsbqflag=0;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
+ if ((error = xfs_trans_reserve(tp, 0,
+ sizeof(xfs_qoff_logitem_t) * 2 +
+ mp->m_sb.sb_sectsize + 128,
+ 0,
+ 0,
+ XFS_DEFAULT_LOG_COUNT))) {
+ goto error0;
+ }
+
+ qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+
+ spin_lock(&mp->m_sb_lock);
+ oldsbqflag = mp->m_sb.sb_qflags;
+ mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_mod_sb(tp, XFS_SB_QFLAGS);
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
+
+error0:
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ /*
+ * No one else is modifying sb_qflags, so this is OK.
+ * We still hold the quotaofflock.
+ */
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = oldsbqflag;
+ spin_unlock(&mp->m_sb_lock);
+ }
+ *qoffstartp = qoffi;
+ return (error);
+}
+
+
+/*
+ * Translate an internal style on-disk-dquot to the exportable format.
+ * The main differences are that the counters/limits are all in Basic
+ * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
+ * to be converted to the native endianness.
+ */
+STATIC void
+xfs_qm_export_dquot(
+ xfs_mount_t *mp,
+ xfs_disk_dquot_t *src,
+ struct fs_disk_quota *dst)
+{
+ memset(dst, 0, sizeof(*dst));
+ dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */
+ dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
+ dst->d_id = be32_to_cpu(src->d_id);
+ dst->d_blk_hardlimit =
+ XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
+ dst->d_blk_softlimit =
+ XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
+ dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
+ dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
+ dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
+ dst->d_icount = be64_to_cpu(src->d_icount);
+ dst->d_btimer = be32_to_cpu(src->d_btimer);
+ dst->d_itimer = be32_to_cpu(src->d_itimer);
+ dst->d_iwarns = be16_to_cpu(src->d_iwarns);
+ dst->d_bwarns = be16_to_cpu(src->d_bwarns);
+ dst->d_rtb_hardlimit =
+ XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
+ dst->d_rtb_softlimit =
+ XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
+ dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
+ dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
+ dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
+
+ /*
+ * Internally, we don't reset all the timers when quota enforcement
+ * gets turned off. No need to confuse the user level code,
+ * so return zeroes in that case.
+ */
+ if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) ||
+ (!XFS_IS_OQUOTA_ENFORCED(mp) &&
+ (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+ dst->d_btimer = 0;
+ dst->d_itimer = 0;
+ dst->d_rtbtimer = 0;
+ }
+
+#ifdef DEBUG
+ if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
+ (XFS_IS_OQUOTA_ENFORCED(mp) &&
+ (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+ dst->d_id != 0) {
+ if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+ (dst->d_blk_softlimit > 0)) {
+ ASSERT(dst->d_btimer != 0);
+ }
+ if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+ (dst->d_ino_softlimit > 0)) {
+ ASSERT(dst->d_itimer != 0);
+ }
+ }
+#endif
+}
+
+STATIC uint
+xfs_qm_export_qtype_flags(
+ uint flags)
+{
+ /*
+ * Can't be more than one, or none.
+ */
+ ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
+ (FS_PROJ_QUOTA | FS_USER_QUOTA));
+ ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
+ (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
+ ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
+ (FS_USER_QUOTA | FS_GROUP_QUOTA));
+ ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
+
+ return (flags & XFS_DQ_USER) ?
+ FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
+ FS_PROJ_QUOTA : FS_GROUP_QUOTA;
+}
+
+STATIC uint
+xfs_qm_export_flags(
+ uint flags)
+{
+ uint uflags;
+
+ uflags = 0;
+ if (flags & XFS_UQUOTA_ACCT)
+ uflags |= FS_QUOTA_UDQ_ACCT;
+ if (flags & XFS_PQUOTA_ACCT)
+ uflags |= FS_QUOTA_PDQ_ACCT;
+ if (flags & XFS_GQUOTA_ACCT)
+ uflags |= FS_QUOTA_GDQ_ACCT;
+ if (flags & XFS_UQUOTA_ENFD)
+ uflags |= FS_QUOTA_UDQ_ENFD;
+ if (flags & (XFS_OQUOTA_ENFD)) {
+ uflags |= (flags & XFS_GQUOTA_ACCT) ?
+ FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
+ }
+ return (uflags);
+}
+
+
+STATIC int
+xfs_dqrele_inode(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags)
+{
+ /* skip quota inodes */
+ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
+ ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
+ ASSERT(ip->i_udquot == NULL);
+ ASSERT(ip->i_gdquot == NULL);
+ return 0;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+ xfs_qm_dqrele(ip->i_udquot);
+ ip->i_udquot = NULL;
+ }
+ if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+ xfs_qm_dqrele(ip->i_gdquot);
+ ip->i_gdquot = NULL;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+
+/*
+ * Go thru all the inodes in the file system, releasing their dquots.
+ *
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ ASSERT(mp->m_quotainfo);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
+}
+
+/*------------------------------------------------------------------------*/
+#ifdef DEBUG
+/*
+ * This contains all the test functions for XFS disk quotas.
+ * Currently it does a quota accounting check. ie. it walks through
+ * all inodes in the file system, calculating the dquot accounting fields,
+ * and prints out any inconsistencies.
+ */
+xfs_dqhash_t *qmtest_udqtab;
+xfs_dqhash_t *qmtest_gdqtab;
+int qmtest_hashmask;
+int qmtest_nfails;
+struct mutex qcheck_lock;
+
+#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
+ (__psunsigned_t)(id)) & \
+ (qmtest_hashmask - 1))
+
+#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \
+ (qmtest_udqtab + \
+ DQTEST_HASHVAL(mp, id)) : \
+ (qmtest_gdqtab + \
+ DQTEST_HASHVAL(mp, id)))
+
+#define DQTEST_LIST_PRINT(l, NXT, title) \
+{ \
+ xfs_dqtest_t *dqp; int i = 0;\
+ xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
+ for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
+ dqp = (xfs_dqtest_t *)dqp->NXT) { \
+ xfs_debug(dqp->q_mount, \
+ " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
+ ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
+ dqp->d_bcount, dqp->d_icount); } \
+}
+
+typedef struct dqtest {
+ uint dq_flags; /* various flags (XFS_DQ_*) */
+ struct list_head q_hashlist;
+ xfs_dqhash_t *q_hash; /* the hashchain header */
+ xfs_mount_t *q_mount; /* filesystem this relates to */
+ xfs_dqid_t d_id; /* user id or group id */
+ xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */
+ xfs_qcnt_t d_icount; /* # inodes owned by the user */
+} xfs_dqtest_t;
+
+STATIC void
+xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
+{
+ list_add(&dqp->q_hashlist, &h->qh_list);
+ h->qh_version++;
+ h->qh_nelems++;
+}
+STATIC void
+xfs_qm_dqtest_print(
+ struct xfs_mount *mp,
+ struct dqtest *d)
+{
+ xfs_debug(mp, "-----------DQTEST DQUOT----------------");
+ xfs_debug(mp, "---- dquot ID = %d", d->d_id);
+ xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
+ xfs_debug(mp, "---- bcount = %Lu (0x%x)",
+ d->d_bcount, (int)d->d_bcount);
+ xfs_debug(mp, "---- icount = %Lu (0x%x)",
+ d->d_icount, (int)d->d_icount);
+ xfs_debug(mp, "---------------------------");
+}
+
+STATIC void
+xfs_qm_dqtest_failed(
+ xfs_dqtest_t *d,
+ xfs_dquot_t *dqp,
+ char *reason,
+ xfs_qcnt_t a,
+ xfs_qcnt_t b,
+ int error)
+{
+ qmtest_nfails++;
+ if (error)
+ xfs_debug(dqp->q_mount,
+ "quotacheck failed id=%d, err=%d\nreason: %s",
+ d->d_id, error, reason);
+ else
+ xfs_debug(dqp->q_mount,
+ "quotacheck failed id=%d (%s) [%d != %d]",
+ d->d_id, reason, (int)a, (int)b);
+ xfs_qm_dqtest_print(dqp->q_mount, d);
+ if (dqp)
+ xfs_qm_dqprint(dqp);
+}
+
+STATIC int
+xfs_dqtest_cmp2(
+ xfs_dqtest_t *d,
+ xfs_dquot_t *dqp)
+{
+ int err = 0;
+ if (be64_to_cpu(dqp->q_core.d_icount) != d->d_icount) {
+ xfs_qm_dqtest_failed(d, dqp, "icount mismatch",
+ be64_to_cpu(dqp->q_core.d_icount),
+ d->d_icount, 0);
+ err++;
+ }
+ if (be64_to_cpu(dqp->q_core.d_bcount) != d->d_bcount) {
+ xfs_qm_dqtest_failed(d, dqp, "bcount mismatch",
+ be64_to_cpu(dqp->q_core.d_bcount),
+ d->d_bcount, 0);
+ err++;
+ }
+ if (dqp->q_core.d_blk_softlimit &&
+ be64_to_cpu(dqp->q_core.d_bcount) >=
+ be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
+ if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
+ xfs_debug(dqp->q_mount,
+ "%d [%s] BLK TIMER NOT STARTED",
+ d->d_id, DQFLAGTO_TYPESTR(d));
+ err++;
+ }
+ }
+ if (dqp->q_core.d_ino_softlimit &&
+ be64_to_cpu(dqp->q_core.d_icount) >=
+ be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
+ if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
+ xfs_debug(dqp->q_mount,
+ "%d [%s] INO TIMER NOT STARTED",
+ d->d_id, DQFLAGTO_TYPESTR(d));
+ err++;
+ }
+ }
+#ifdef QUOTADEBUG
+ if (!err) {
+ xfs_debug(dqp->q_mount, "%d [%s] qchecked",
+ d->d_id, DQFLAGTO_TYPESTR(d));
+ }
+#endif
+ return (err);
+}
+
+STATIC void
+xfs_dqtest_cmp(
+ xfs_dqtest_t *d)
+{
+ xfs_dquot_t *dqp;
+ int error;
+
+ /* xfs_qm_dqtest_print(d); */
+ if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0,
+ &dqp))) {
+ xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error);
+ return;
+ }
+ xfs_dqtest_cmp2(d, dqp);
+ xfs_qm_dqput(dqp);
+}
+
+STATIC int
+xfs_qm_internalqcheck_dqget(
+ xfs_mount_t *mp,
+ xfs_dqid_t id,
+ uint type,
+ xfs_dqtest_t **O_dq)
+{
+ xfs_dqtest_t *d;
+ xfs_dqhash_t *h;
+
+ h = DQTEST_HASH(mp, id, type);
+ list_for_each_entry(d, &h->qh_list, q_hashlist) {
+ if (d->d_id == id && mp == d->q_mount) {
+ *O_dq = d;
+ return (0);
+ }
+ }
+ d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP);
+ d->dq_flags = type;
+ d->d_id = id;
+ d->q_mount = mp;
+ d->q_hash = h;
+ INIT_LIST_HEAD(&d->q_hashlist);
+ xfs_qm_hashinsert(h, d);
+ *O_dq = d;
+ return (0);
+}
+
+STATIC void
+xfs_qm_internalqcheck_get_dquots(
+ xfs_mount_t *mp,
+ xfs_dqid_t uid,
+ xfs_dqid_t projid,
+ xfs_dqid_t gid,
+ xfs_dqtest_t **ud,
+ xfs_dqtest_t **gd)
+{
+ if (XFS_IS_UQUOTA_ON(mp))
+ xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud);
+ if (XFS_IS_GQUOTA_ON(mp))
+ xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd);
+ else if (XFS_IS_PQUOTA_ON(mp))
+ xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd);
+}
+
+
+STATIC void
+xfs_qm_internalqcheck_dqadjust(
+ xfs_inode_t *ip,
+ xfs_dqtest_t *d)
+{
+ d->d_icount++;
+ d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks;
+}
+
+STATIC int
+xfs_qm_internalqcheck_adjust(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* not used */
+ int ubsize, /* not used */
+ int *ubused, /* not used */
+ int *res) /* bulkstat result code */
+{
+ xfs_inode_t *ip;
+ xfs_dqtest_t *ud, *gd;
+ uint lock_flags;
+ boolean_t ipreleased;
+ int error;
+
+ ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+ if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+ *res = BULKSTAT_RV_NOTHING;
+ xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
+ __func__, (unsigned long long) ino,
+ (unsigned long long) mp->m_sb.sb_uquotino,
+ (unsigned long long) mp->m_sb.sb_gquotino);
+ return XFS_ERROR(EINVAL);
+ }
+ ipreleased = B_FALSE;
+ again:
+ lock_flags = XFS_ILOCK_SHARED;
+ if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip))) {
+ *res = BULKSTAT_RV_NOTHING;
+ return (error);
+ }
+
+ /*
+ * This inode can have blocks after eof which can get released
+ * when we send it to inactive. Since we don't check the dquot
+ * until the after all our calculations are done, we must get rid
+ * of those now.
+ */
+ if (! ipreleased) {
+ xfs_iunlock(ip, lock_flags);
+ IRELE(ip);
+ ipreleased = B_TRUE;
+ goto again;
+ }
+ xfs_qm_internalqcheck_get_dquots(mp,
+ (xfs_dqid_t) ip->i_d.di_uid,
+ (xfs_dqid_t) xfs_get_projid(ip),
+ (xfs_dqid_t) ip->i_d.di_gid,
+ &ud, &gd);
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ ASSERT(ud);
+ xfs_qm_internalqcheck_dqadjust(ip, ud);
+ }
+ if (XFS_IS_OQUOTA_ON(mp)) {
+ ASSERT(gd);
+ xfs_qm_internalqcheck_dqadjust(ip, gd);
+ }
+ xfs_iunlock(ip, lock_flags);
+ IRELE(ip);
+ *res = BULKSTAT_RV_DIDONE;
+ return (0);
+}
+
+
+/* PRIVATE, debugging */
+int
+xfs_qm_internalqcheck(
+ xfs_mount_t *mp)
+{
+ xfs_ino_t lastino;
+ int done, count;
+ int i;
+ int error;
+
+ lastino = 0;
+ qmtest_hashmask = 32;
+ count = 5;
+ done = 0;
+ qmtest_nfails = 0;
+
+ if (! XFS_IS_QUOTA_ON(mp))
+ return XFS_ERROR(ESRCH);
+
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ XFS_bflush(mp->m_ddev_targp);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ XFS_bflush(mp->m_ddev_targp);
+
+ mutex_lock(&qcheck_lock);
+ /* There should be absolutely no quota activity while this
+ is going on. */
+ qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
+ sizeof(xfs_dqhash_t), KM_SLEEP);
+ qmtest_gdqtab = kmem_zalloc(qmtest_hashmask *
+ sizeof(xfs_dqhash_t), KM_SLEEP);
+ do {
+ /*
+ * Iterate thru all the inodes in the file system,
+ * adjusting the corresponding dquot counters
+ */
+ error = xfs_bulkstat(mp, &lastino, &count,
+ xfs_qm_internalqcheck_adjust,
+ 0, NULL, &done);
+ if (error) {
+ xfs_debug(mp, "Bulkstat returned error 0x%x", error);
+ break;
+ }
+ } while (!done);
+
+ xfs_debug(mp, "Checking results against system dquots");
+ for (i = 0; i < qmtest_hashmask; i++) {
+ xfs_dqtest_t *d, *n;
+ xfs_dqhash_t *h;
+
+ h = &qmtest_udqtab[i];
+ list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
+ xfs_dqtest_cmp(d);
+ kmem_free(d);
+ }
+ h = &qmtest_gdqtab[i];
+ list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
+ xfs_dqtest_cmp(d);
+ kmem_free(d);
+ }
+ }
+
+ if (qmtest_nfails) {
+ xfs_debug(mp, "******** quotacheck failed ********");
+ xfs_debug(mp, "failures = %d", qmtest_nfails);
+ } else {
+ xfs_debug(mp, "******** quotacheck successful! ********");
+ }
+ kmem_free(qmtest_udqtab);
+ kmem_free(qmtest_gdqtab);
+ mutex_unlock(&qcheck_lock);
+ return (qmtest_nfails);
+}
+
+#endif /* DEBUG */
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
new file mode 100644
index 00000000..94a3d927
--- /dev/null
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_QUOTA_PRIV_H__
+#define __XFS_QUOTA_PRIV_H__
+
+/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE 10
+
+/*
+ * Hash into a bucket in the dquot hash table, based on <mp, id>.
+ */
+#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
+ (__psunsigned_t)(id)) & \
+ (xfs_Gqm->qm_dqhashmask - 1))
+#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
+ (xfs_Gqm->qm_usr_dqhtable + \
+ XFS_DQ_HASHVAL(mp, id)) : \
+ (xfs_Gqm->qm_grp_dqhtable + \
+ XFS_DQ_HASHVAL(mp, id)))
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+ !dqp->q_core.d_blk_hardlimit && \
+ !dqp->q_core.d_blk_softlimit && \
+ !dqp->q_core.d_rtb_hardlimit && \
+ !dqp->q_core.d_rtb_softlimit && \
+ !dqp->q_core.d_ino_hardlimit && \
+ !dqp->q_core.d_ino_softlimit && \
+ !dqp->q_core.d_bcount && \
+ !dqp->q_core.d_rtbcount && \
+ !dqp->q_core.d_icount)
+
+#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
+ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
+ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
+
+#endif /* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
new file mode 100644
index 00000000..2a364873
--- /dev/null
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -0,0 +1,895 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+
+STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
+
+/*
+ * Add the locked dquot to the transaction.
+ * The dquot must be locked, and it cannot be associated with any
+ * transaction.
+ */
+void
+xfs_trans_dqjoin(
+ xfs_trans_t *tp,
+ xfs_dquot_t *dqp)
+{
+ ASSERT(dqp->q_transp != tp);
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ ASSERT(dqp->q_logitem.qli_dquot == dqp);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
+
+ /*
+ * Initialize i_transp so we can later determine if this dquot is
+ * associated with this transaction.
+ */
+ dqp->q_transp = tp;
+}
+
+
+/*
+ * This is called to mark the dquot as needing
+ * to be logged when the transaction is committed. The dquot must
+ * already be associated with the given transaction.
+ * Note that it marks the entire transaction as dirty. In the ordinary
+ * case, this gets called via xfs_trans_commit, after the transaction
+ * is already dirty. However, there's nothing stop this from getting
+ * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
+ * flag.
+ */
+void
+xfs_trans_log_dquot(
+ xfs_trans_t *tp,
+ xfs_dquot_t *dqp)
+{
+ ASSERT(dqp->q_transp == tp);
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
+
+/*
+ * Carry forward whatever is left of the quota blk reservation to
+ * the spanky new transaction
+ */
+void
+xfs_trans_dup_dqinfo(
+ xfs_trans_t *otp,
+ xfs_trans_t *ntp)
+{
+ xfs_dqtrx_t *oq, *nq;
+ int i,j;
+ xfs_dqtrx_t *oqa, *nqa;
+
+ if (!otp->t_dqinfo)
+ return;
+
+ xfs_trans_alloc_dqinfo(ntp);
+ oqa = otp->t_dqinfo->dqa_usrdquots;
+ nqa = ntp->t_dqinfo->dqa_usrdquots;
+
+ /*
+ * Because the quota blk reservation is carried forward,
+ * it is also necessary to carry forward the DQ_DIRTY flag.
+ */
+ if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+ ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
+
+ for (j = 0; j < 2; j++) {
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ if (oqa[i].qt_dquot == NULL)
+ break;
+ oq = &oqa[i];
+ nq = &nqa[i];
+
+ nq->qt_dquot = oq->qt_dquot;
+ nq->qt_bcount_delta = nq->qt_icount_delta = 0;
+ nq->qt_rtbcount_delta = 0;
+
+ /*
+ * Transfer whatever is left of the reservations.
+ */
+ nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
+ oq->qt_blk_res = oq->qt_blk_res_used;
+
+ nq->qt_rtblk_res = oq->qt_rtblk_res -
+ oq->qt_rtblk_res_used;
+ oq->qt_rtblk_res = oq->qt_rtblk_res_used;
+
+ nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
+ oq->qt_ino_res = oq->qt_ino_res_used;
+
+ }
+ oqa = otp->t_dqinfo->dqa_grpdquots;
+ nqa = ntp->t_dqinfo->dqa_grpdquots;
+ }
+}
+
+/*
+ * Wrap around mod_dquot to account for both user and group quotas.
+ */
+void
+xfs_trans_mod_dquot_byino(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ uint field,
+ long delta)
+{
+ xfs_mount_t *mp = tp->t_mountp;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) ||
+ !XFS_IS_QUOTA_ON(mp) ||
+ ip->i_ino == mp->m_sb.sb_uquotino ||
+ ip->i_ino == mp->m_sb.sb_gquotino)
+ return;
+
+ if (tp->t_dqinfo == NULL)
+ xfs_trans_alloc_dqinfo(tp);
+
+ if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+ if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
+ (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+}
+
+STATIC xfs_dqtrx_t *
+xfs_trans_get_dqtrx(
+ xfs_trans_t *tp,
+ xfs_dquot_t *dqp)
+{
+ int i;
+ xfs_dqtrx_t *qa;
+
+ qa = XFS_QM_ISUDQ(dqp) ?
+ tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ if (qa[i].qt_dquot == NULL ||
+ qa[i].qt_dquot == dqp)
+ return &qa[i];
+ }
+
+ return NULL;
+}
+
+/*
+ * Make the changes in the transaction structure.
+ * The moral equivalent to xfs_trans_mod_sb().
+ * We don't touch any fields in the dquot, so we don't care
+ * if it's locked or not (most of the time it won't be).
+ */
+void
+xfs_trans_mod_dquot(
+ xfs_trans_t *tp,
+ xfs_dquot_t *dqp,
+ uint field,
+ long delta)
+{
+ xfs_dqtrx_t *qtrx;
+
+ ASSERT(tp);
+ ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+ qtrx = NULL;
+
+ if (tp->t_dqinfo == NULL)
+ xfs_trans_alloc_dqinfo(tp);
+ /*
+ * Find either the first free slot or the slot that belongs
+ * to this dquot.
+ */
+ qtrx = xfs_trans_get_dqtrx(tp, dqp);
+ ASSERT(qtrx);
+ if (qtrx->qt_dquot == NULL)
+ qtrx->qt_dquot = dqp;
+
+ switch (field) {
+
+ /*
+ * regular disk blk reservation
+ */
+ case XFS_TRANS_DQ_RES_BLKS:
+ qtrx->qt_blk_res += (ulong)delta;
+ break;
+
+ /*
+ * inode reservation
+ */
+ case XFS_TRANS_DQ_RES_INOS:
+ qtrx->qt_ino_res += (ulong)delta;
+ break;
+
+ /*
+ * disk blocks used.
+ */
+ case XFS_TRANS_DQ_BCOUNT:
+ if (qtrx->qt_blk_res && delta > 0) {
+ qtrx->qt_blk_res_used += (ulong)delta;
+ ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
+ }
+ qtrx->qt_bcount_delta += delta;
+ break;
+
+ case XFS_TRANS_DQ_DELBCOUNT:
+ qtrx->qt_delbcnt_delta += delta;
+ break;
+
+ /*
+ * Inode Count
+ */
+ case XFS_TRANS_DQ_ICOUNT:
+ if (qtrx->qt_ino_res && delta > 0) {
+ qtrx->qt_ino_res_used += (ulong)delta;
+ ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+ }
+ qtrx->qt_icount_delta += delta;
+ break;
+
+ /*
+ * rtblk reservation
+ */
+ case XFS_TRANS_DQ_RES_RTBLKS:
+ qtrx->qt_rtblk_res += (ulong)delta;
+ break;
+
+ /*
+ * rtblk count
+ */
+ case XFS_TRANS_DQ_RTBCOUNT:
+ if (qtrx->qt_rtblk_res && delta > 0) {
+ qtrx->qt_rtblk_res_used += (ulong)delta;
+ ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
+ }
+ qtrx->qt_rtbcount_delta += delta;
+ break;
+
+ case XFS_TRANS_DQ_DELRTBCOUNT:
+ qtrx->qt_delrtb_delta += delta;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+ tp->t_flags |= XFS_TRANS_DQ_DIRTY;
+}
+
+
+/*
+ * Given an array of dqtrx structures, lock all the dquots associated
+ * and join them to the transaction, provided they have been modified.
+ * We know that the highest number of dquots (of one type - usr OR grp),
+ * involved in a transaction is 2 and that both usr and grp combined - 3.
+ * So, we don't attempt to make this very generic.
+ */
+STATIC void
+xfs_trans_dqlockedjoin(
+ xfs_trans_t *tp,
+ xfs_dqtrx_t *q)
+{
+ ASSERT(q[0].qt_dquot != NULL);
+ if (q[1].qt_dquot == NULL) {
+ xfs_dqlock(q[0].qt_dquot);
+ xfs_trans_dqjoin(tp, q[0].qt_dquot);
+ } else {
+ ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+ xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
+ xfs_trans_dqjoin(tp, q[0].qt_dquot);
+ xfs_trans_dqjoin(tp, q[1].qt_dquot);
+ }
+}
+
+
+/*
+ * Called by xfs_trans_commit() and similar in spirit to
+ * xfs_trans_apply_sb_deltas().
+ * Go thru all the dquots belonging to this transaction and modify the
+ * INCORE dquot to reflect the actual usages.
+ * Unreserve just the reservations done by this transaction.
+ * dquot is still left locked at exit.
+ */
+void
+xfs_trans_apply_dquot_deltas(
+ xfs_trans_t *tp)
+{
+ int i, j;
+ xfs_dquot_t *dqp;
+ xfs_dqtrx_t *qtrx, *qa;
+ xfs_disk_dquot_t *d;
+ long totalbdelta;
+ long totalrtbdelta;
+
+ if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+ return;
+
+ ASSERT(tp->t_dqinfo);
+ qa = tp->t_dqinfo->dqa_usrdquots;
+ for (j = 0; j < 2; j++) {
+ if (qa[0].qt_dquot == NULL) {
+ qa = tp->t_dqinfo->dqa_grpdquots;
+ continue;
+ }
+
+ /*
+ * Lock all of the dquots and join them to the transaction.
+ */
+ xfs_trans_dqlockedjoin(tp, qa);
+
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ qtrx = &qa[i];
+ /*
+ * The array of dquots is filled
+ * sequentially, not sparsely.
+ */
+ if ((dqp = qtrx->qt_dquot) == NULL)
+ break;
+
+ ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ ASSERT(dqp->q_transp == tp);
+
+ /*
+ * adjust the actual number of blocks used
+ */
+ d = &dqp->q_core;
+
+ /*
+ * The issue here is - sometimes we don't make a blkquota
+ * reservation intentionally to be fair to users
+ * (when the amount is small). On the other hand,
+ * delayed allocs do make reservations, but that's
+ * outside of a transaction, so we have no
+ * idea how much was really reserved.
+ * So, here we've accumulated delayed allocation blks and
+ * non-delay blks. The assumption is that the
+ * delayed ones are always reserved (outside of a
+ * transaction), and the others may or may not have
+ * quota reservations.
+ */
+ totalbdelta = qtrx->qt_bcount_delta +
+ qtrx->qt_delbcnt_delta;
+ totalrtbdelta = qtrx->qt_rtbcount_delta +
+ qtrx->qt_delrtb_delta;
+#ifdef QUOTADEBUG
+ if (totalbdelta < 0)
+ ASSERT(be64_to_cpu(d->d_bcount) >=
+ (xfs_qcnt_t) -totalbdelta);
+
+ if (totalrtbdelta < 0)
+ ASSERT(be64_to_cpu(d->d_rtbcount) >=
+ (xfs_qcnt_t) -totalrtbdelta);
+
+ if (qtrx->qt_icount_delta < 0)
+ ASSERT(be64_to_cpu(d->d_icount) >=
+ (xfs_qcnt_t) -qtrx->qt_icount_delta);
+#endif
+ if (totalbdelta)
+ be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+
+ if (qtrx->qt_icount_delta)
+ be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+
+ if (totalrtbdelta)
+ be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+
+ /*
+ * Get any default limits in use.
+ * Start/reset the timer(s) if needed.
+ */
+ if (d->d_id) {
+ xfs_qm_adjust_dqlimits(tp->t_mountp, d);
+ xfs_qm_adjust_dqtimers(tp->t_mountp, d);
+ }
+
+ dqp->dq_flags |= XFS_DQ_DIRTY;
+ /*
+ * add this to the list of items to get logged
+ */
+ xfs_trans_log_dquot(tp, dqp);
+ /*
+ * Take off what's left of the original reservation.
+ * In case of delayed allocations, there's no
+ * reservation that a transaction structure knows of.
+ */
+ if (qtrx->qt_blk_res != 0) {
+ if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
+ if (qtrx->qt_blk_res >
+ qtrx->qt_blk_res_used)
+ dqp->q_res_bcount -= (xfs_qcnt_t)
+ (qtrx->qt_blk_res -
+ qtrx->qt_blk_res_used);
+ else
+ dqp->q_res_bcount -= (xfs_qcnt_t)
+ (qtrx->qt_blk_res_used -
+ qtrx->qt_blk_res);
+ }
+ } else {
+ /*
+ * These blks were never reserved, either inside
+ * a transaction or outside one (in a delayed
+ * allocation). Also, this isn't always a
+ * negative number since we sometimes
+ * deliberately skip quota reservations.
+ */
+ if (qtrx->qt_bcount_delta) {
+ dqp->q_res_bcount +=
+ (xfs_qcnt_t)qtrx->qt_bcount_delta;
+ }
+ }
+ /*
+ * Adjust the RT reservation.
+ */
+ if (qtrx->qt_rtblk_res != 0) {
+ if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
+ if (qtrx->qt_rtblk_res >
+ qtrx->qt_rtblk_res_used)
+ dqp->q_res_rtbcount -= (xfs_qcnt_t)
+ (qtrx->qt_rtblk_res -
+ qtrx->qt_rtblk_res_used);
+ else
+ dqp->q_res_rtbcount -= (xfs_qcnt_t)
+ (qtrx->qt_rtblk_res_used -
+ qtrx->qt_rtblk_res);
+ }
+ } else {
+ if (qtrx->qt_rtbcount_delta)
+ dqp->q_res_rtbcount +=
+ (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
+ }
+
+ /*
+ * Adjust the inode reservation.
+ */
+ if (qtrx->qt_ino_res != 0) {
+ ASSERT(qtrx->qt_ino_res >=
+ qtrx->qt_ino_res_used);
+ if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
+ dqp->q_res_icount -= (xfs_qcnt_t)
+ (qtrx->qt_ino_res -
+ qtrx->qt_ino_res_used);
+ } else {
+ if (qtrx->qt_icount_delta)
+ dqp->q_res_icount +=
+ (xfs_qcnt_t)qtrx->qt_icount_delta;
+ }
+
+ ASSERT(dqp->q_res_bcount >=
+ be64_to_cpu(dqp->q_core.d_bcount));
+ ASSERT(dqp->q_res_icount >=
+ be64_to_cpu(dqp->q_core.d_icount));
+ ASSERT(dqp->q_res_rtbcount >=
+ be64_to_cpu(dqp->q_core.d_rtbcount));
+ }
+ /*
+ * Do the group quotas next
+ */
+ qa = tp->t_dqinfo->dqa_grpdquots;
+ }
+}
+
+/*
+ * Release the reservations, and adjust the dquots accordingly.
+ * This is called only when the transaction is being aborted. If by
+ * any chance we have done dquot modifications incore (ie. deltas) already,
+ * we simply throw those away, since that's the expected behavior
+ * when a transaction is curtailed without a commit.
+ */
+void
+xfs_trans_unreserve_and_mod_dquots(
+ xfs_trans_t *tp)
+{
+ int i, j;
+ xfs_dquot_t *dqp;
+ xfs_dqtrx_t *qtrx, *qa;
+ boolean_t locked;
+
+ if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+ return;
+
+ qa = tp->t_dqinfo->dqa_usrdquots;
+
+ for (j = 0; j < 2; j++) {
+ for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ qtrx = &qa[i];
+ /*
+ * We assume that the array of dquots is filled
+ * sequentially, not sparsely.
+ */
+ if ((dqp = qtrx->qt_dquot) == NULL)
+ break;
+ /*
+ * Unreserve the original reservation. We don't care
+ * about the number of blocks used field, or deltas.
+ * Also we don't bother to zero the fields.
+ */
+ locked = B_FALSE;
+ if (qtrx->qt_blk_res) {
+ xfs_dqlock(dqp);
+ locked = B_TRUE;
+ dqp->q_res_bcount -=
+ (xfs_qcnt_t)qtrx->qt_blk_res;
+ }
+ if (qtrx->qt_ino_res) {
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = B_TRUE;
+ }
+ dqp->q_res_icount -=
+ (xfs_qcnt_t)qtrx->qt_ino_res;
+ }
+
+ if (qtrx->qt_rtblk_res) {
+ if (!locked) {
+ xfs_dqlock(dqp);
+ locked = B_TRUE;
+ }
+ dqp->q_res_rtbcount -=
+ (xfs_qcnt_t)qtrx->qt_rtblk_res;
+ }
+ if (locked)
+ xfs_dqunlock(dqp);
+
+ }
+ qa = tp->t_dqinfo->dqa_grpdquots;
+ }
+}
+
+STATIC void
+xfs_quota_warn(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ int type)
+{
+ /* no warnings for project quotas - we just return ENOSPC later */
+ if (dqp->dq_flags & XFS_DQ_PROJ)
+ return;
+ quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
+ be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
+ type);
+}
+
+/*
+ * This reserves disk blocks and inodes against a dquot.
+ * Flags indicate if the dquot is to be locked here and also
+ * if the blk reservation is for RT or regular blocks.
+ * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
+ */
+STATIC int
+xfs_trans_dqresv(
+ xfs_trans_t *tp,
+ xfs_mount_t *mp,
+ xfs_dquot_t *dqp,
+ long nblks,
+ long ninos,
+ uint flags)
+{
+ xfs_qcnt_t hardlimit;
+ xfs_qcnt_t softlimit;
+ time_t timer;
+ xfs_qwarncnt_t warns;
+ xfs_qwarncnt_t warnlimit;
+ xfs_qcnt_t count;
+ xfs_qcnt_t *resbcountp;
+ xfs_quotainfo_t *q = mp->m_quotainfo;
+
+
+ xfs_dqlock(dqp);
+
+ if (flags & XFS_TRANS_DQ_RES_BLKS) {
+ hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+ if (!hardlimit)
+ hardlimit = q->qi_bhardlimit;
+ softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+ if (!softlimit)
+ softlimit = q->qi_bsoftlimit;
+ timer = be32_to_cpu(dqp->q_core.d_btimer);
+ warns = be16_to_cpu(dqp->q_core.d_bwarns);
+ warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
+ resbcountp = &dqp->q_res_bcount;
+ } else {
+ ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
+ hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
+ if (!hardlimit)
+ hardlimit = q->qi_rtbhardlimit;
+ softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
+ if (!softlimit)
+ softlimit = q->qi_rtbsoftlimit;
+ timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+ warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+ warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
+ resbcountp = &dqp->q_res_rtbcount;
+ }
+
+ if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
+ dqp->q_core.d_id &&
+ ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
+ (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
+ (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+#ifdef QUOTADEBUG
+ xfs_debug(mp,
+ "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
+ nblks, *resbcountp, hardlimit);
+#endif
+ if (nblks > 0) {
+ /*
+ * dquot is locked already. See if we'd go over the
+ * hardlimit or exceed the timelimit if we allocate
+ * nblks.
+ */
+ if (hardlimit > 0ULL &&
+ hardlimit <= nblks + *resbcountp) {
+ xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+ goto error_return;
+ }
+ if (softlimit > 0ULL &&
+ softlimit <= nblks + *resbcountp) {
+ if ((timer != 0 && get_seconds() > timer) ||
+ (warns != 0 && warns >= warnlimit)) {
+ xfs_quota_warn(mp, dqp,
+ QUOTA_NL_BSOFTLONGWARN);
+ goto error_return;
+ }
+
+ xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
+ }
+ }
+ if (ninos > 0) {
+ count = be64_to_cpu(dqp->q_core.d_icount);
+ timer = be32_to_cpu(dqp->q_core.d_itimer);
+ warns = be16_to_cpu(dqp->q_core.d_iwarns);
+ warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
+ hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+ if (!hardlimit)
+ hardlimit = q->qi_ihardlimit;
+ softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
+ if (!softlimit)
+ softlimit = q->qi_isoftlimit;
+
+ if (hardlimit > 0ULL && count >= hardlimit) {
+ xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+ goto error_return;
+ }
+ if (softlimit > 0ULL && count >= softlimit) {
+ if ((timer != 0 && get_seconds() > timer) ||
+ (warns != 0 && warns >= warnlimit)) {
+ xfs_quota_warn(mp, dqp,
+ QUOTA_NL_ISOFTLONGWARN);
+ goto error_return;
+ }
+ xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
+ }
+ }
+ }
+
+ /*
+ * Change the reservation, but not the actual usage.
+ * Note that q_res_bcount = q_core.d_bcount + resv
+ */
+ (*resbcountp) += (xfs_qcnt_t)nblks;
+ if (ninos != 0)
+ dqp->q_res_icount += (xfs_qcnt_t)ninos;
+
+ /*
+ * note the reservation amt in the trans struct too,
+ * so that the transaction knows how much was reserved by
+ * it against this particular dquot.
+ * We don't do this when we are reserving for a delayed allocation,
+ * because we don't have the luxury of a transaction envelope then.
+ */
+ if (tp) {
+ ASSERT(tp->t_dqinfo);
+ ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+ if (nblks != 0)
+ xfs_trans_mod_dquot(tp, dqp,
+ flags & XFS_QMOPT_RESBLK_MASK,
+ nblks);
+ if (ninos != 0)
+ xfs_trans_mod_dquot(tp, dqp,
+ XFS_TRANS_DQ_RES_INOS,
+ ninos);
+ }
+ ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
+ ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
+ ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+
+ xfs_dqunlock(dqp);
+ return 0;
+
+error_return:
+ xfs_dqunlock(dqp);
+ if (flags & XFS_QMOPT_ENOSPC)
+ return ENOSPC;
+ return EDQUOT;
+}
+
+
+/*
+ * Given dquot(s), make disk block and/or inode reservations against them.
+ * The fact that this does the reservation against both the usr and
+ * grp/prj quotas is important, because this follows a both-or-nothing
+ * approach.
+ *
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
+ * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
+ * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
+ * dquots are unlocked on return, if they were not locked by caller.
+ */
+int
+xfs_trans_reserve_quota_bydquots(
+ xfs_trans_t *tp,
+ xfs_mount_t *mp,
+ xfs_dquot_t *udqp,
+ xfs_dquot_t *gdqp,
+ long nblks,
+ long ninos,
+ uint flags)
+{
+ int resvd = 0, error;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ if (tp && tp->t_dqinfo == NULL)
+ xfs_trans_alloc_dqinfo(tp);
+
+ ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+
+ if (udqp) {
+ error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
+ (flags & ~XFS_QMOPT_ENOSPC));
+ if (error)
+ return error;
+ resvd = 1;
+ }
+
+ if (gdqp) {
+ error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
+ if (error) {
+ /*
+ * can't do it, so backout previous reservation
+ */
+ if (resvd) {
+ flags |= XFS_QMOPT_FORCE_RES;
+ xfs_trans_dqresv(tp, mp, udqp,
+ -nblks, -ninos, flags);
+ }
+ return error;
+ }
+ }
+
+ /*
+ * Didn't change anything critical, so, no need to log
+ */
+ return 0;
+}
+
+
+/*
+ * Lock the dquot and change the reservation if we can.
+ * This doesn't change the actual usage, just the reservation.
+ * The inode sent in is locked.
+ */
+int
+xfs_trans_reserve_quota_nblks(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ long nblks,
+ long ninos,
+ uint flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return 0;
+ if (XFS_IS_PQUOTA_ON(mp))
+ flags |= XFS_QMOPT_ENOSPC;
+
+ ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
+ ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+ XFS_TRANS_DQ_RES_RTBLKS ||
+ (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+ XFS_TRANS_DQ_RES_BLKS);
+
+ /*
+ * Reserve nblks against these dquots, with trans as the mediator.
+ */
+ return xfs_trans_reserve_quota_bydquots(tp, mp,
+ ip->i_udquot, ip->i_gdquot,
+ nblks, ninos, flags);
+}
+
+/*
+ * This routine is called to allocate a quotaoff log item.
+ */
+xfs_qoff_logitem_t *
+xfs_trans_get_qoff_item(
+ xfs_trans_t *tp,
+ xfs_qoff_logitem_t *startqoff,
+ uint flags)
+{
+ xfs_qoff_logitem_t *q;
+
+ ASSERT(tp != NULL);
+
+ q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
+ ASSERT(q != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &q->qql_item);
+ return q;
+}
+
+
+/*
+ * This is called to mark the quotaoff logitem as needing
+ * to be logged when the transaction is committed. The logitem must
+ * already be associated with the given transaction.
+ */
+void
+xfs_trans_log_quotaoff_item(
+ xfs_trans_t *tp,
+ xfs_qoff_logitem_t *qlp)
+{
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
+
+STATIC void
+xfs_trans_alloc_dqinfo(
+ xfs_trans_t *tp)
+{
+ tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+}
+
+void
+xfs_trans_free_dqinfo(
+ xfs_trans_t *tp)
+{
+ if (!tp->t_dqinfo)
+ return;
+ kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
+ tp->t_dqinfo = NULL;
+}
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
new file mode 100644
index 00000000..b83f76b6
--- /dev/null
+++ b/fs/xfs/support/uuid.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <xfs.h>
+
+/* IRIX interpretation of an uuid_t */
+typedef struct {
+ __be32 uu_timelow;
+ __be16 uu_timemid;
+ __be16 uu_timehi;
+ __be16 uu_clockseq;
+ __be16 uu_node[3];
+} xfs_uu_t;
+
+/*
+ * uuid_getnodeuniq - obtain the node unique fields of a UUID.
+ *
+ * This is not in any way a standard or condoned UUID function;
+ * it just something that's needed for user-level file handles.
+ */
+void
+uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
+{
+ xfs_uu_t *uup = (xfs_uu_t *)uuid;
+
+ fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
+ be16_to_cpu(uup->uu_timemid);
+ fsid[1] = be32_to_cpu(uup->uu_timelow);
+}
+
+int
+uuid_is_nil(uuid_t *uuid)
+{
+ int i;
+ char *cp = (char *)uuid;
+
+ if (uuid == NULL)
+ return 0;
+ /* implied check of version number here... */
+ for (i = 0; i < sizeof *uuid; i++)
+ if (*cp++) return 0; /* not nil */
+ return 1; /* is nil */
+}
+
+int
+uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
+{
+ return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
+}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
new file mode 100644
index 00000000..4732d712
--- /dev/null
+++ b/fs/xfs/support/uuid.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SUPPORT_UUID_H__
+#define __XFS_SUPPORT_UUID_H__
+
+typedef struct {
+ unsigned char __u_bits[16];
+} uuid_t;
+
+extern int uuid_is_nil(uuid_t *uuid);
+extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
+extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+
+#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
new file mode 100644
index 00000000..5ad8ad3a
--- /dev/null
+++ b/fs/xfs/xfs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_H__
+#define __XFS_H__
+
+#ifdef CONFIG_XFS_DEBUG
+#define STATIC
+#define DEBUG 1
+#define XFS_BUF_LOCK_TRACKING 1
+/* #define QUOTADEBUG 1 */
+#endif
+
+#include <linux-2.6/xfs_linux.h>
+#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
new file mode 100644
index 00000000..11dd7207
--- /dev/null
+++ b/fs/xfs/xfs_acl.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ACL_H__
+#define __XFS_ACL_H__
+
+struct inode;
+struct posix_acl;
+struct xfs_inode;
+
+#define XFS_ACL_MAX_ENTRIES 25
+#define XFS_ACL_NOT_PRESENT (-1)
+
+/* On-disk XFS access control list structure */
+struct xfs_acl {
+ __be32 acl_cnt;
+ struct xfs_acl_entry {
+ __be32 ae_tag;
+ __be32 ae_id;
+ __be16 ae_perm;
+ } acl_entry[XFS_ACL_MAX_ENTRIES];
+};
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
+#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
+
+#ifdef CONFIG_XFS_POSIX_ACL
+extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags);
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
+extern int xfs_acl_chmod(struct inode *inode);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
+
+extern const struct xattr_handler xfs_xattr_acl_access_handler;
+extern const struct xattr_handler xfs_xattr_acl_default_handler;
+#else
+# define xfs_check_acl NULL
+# define xfs_get_acl(inode, type) NULL
+# define xfs_inherit_acl(inode, default_acl) 0
+# define xfs_acl_chmod(inode) 0
+# define posix_acl_access_exists(inode) 0
+# define posix_acl_default_exists(inode) 0
+#endif /* CONFIG_XFS_POSIX_ACL */
+#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
new file mode 100644
index 00000000..6530769a
--- /dev/null
+++ b/fs/xfs/xfs_ag.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_AG_H__
+#define __XFS_AG_H__
+
+/*
+ * Allocation group header
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+
+#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
+#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
+#define XFS_AGF_VERSION 1
+#define XFS_AGI_VERSION 1
+
+#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
+#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
+
+/*
+ * Btree number 0 is bno, 1 is cnt. This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number. Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */
+ __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */
+ __be32 agf_seqno; /* sequence # starting from 0 */
+ __be32 agf_length; /* size in blocks of a.g. */
+ /*
+ * Freespace information
+ */
+ __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */
+ __be32 agf_spare0; /* spare field */
+ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */
+ __be32 agf_spare1; /* spare field */
+ __be32 agf_flfirst; /* first freelist block's index */
+ __be32 agf_fllast; /* last freelist block's index */
+ __be32 agf_flcount; /* count of blocks in freelist */
+ __be32 agf_freeblks; /* total free blocks */
+ __be32 agf_longest; /* longest free space */
+ __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
+} xfs_agf_t;
+
+#define XFS_AGF_MAGICNUM 0x00000001
+#define XFS_AGF_VERSIONNUM 0x00000002
+#define XFS_AGF_SEQNO 0x00000004
+#define XFS_AGF_LENGTH 0x00000008
+#define XFS_AGF_ROOTS 0x00000010
+#define XFS_AGF_LEVELS 0x00000020
+#define XFS_AGF_FLFIRST 0x00000040
+#define XFS_AGF_FLLAST 0x00000080
+#define XFS_AGF_FLCOUNT 0x00000100
+#define XFS_AGF_FREEBLKS 0x00000200
+#define XFS_AGF_LONGEST 0x00000400
+#define XFS_AGF_BTREEBLKS 0x00000800
+#define XFS_AGF_NUM_BITS 12
+#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+
+#define XFS_AGF_FLAGS \
+ { XFS_AGF_MAGICNUM, "MAGICNUM" }, \
+ { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \
+ { XFS_AGF_SEQNO, "SEQNO" }, \
+ { XFS_AGF_LENGTH, "LENGTH" }, \
+ { XFS_AGF_ROOTS, "ROOTS" }, \
+ { XFS_AGF_LEVELS, "LEVELS" }, \
+ { XFS_AGF_FLFIRST, "FLFIRST" }, \
+ { XFS_AGF_FLLAST, "FLLAST" }, \
+ { XFS_AGF_FLCOUNT, "FLCOUNT" }, \
+ { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
+ { XFS_AGF_LONGEST, "LONGEST" }, \
+ { XFS_AGF_BTREEBLKS, "BTREEBLKS" }
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
+#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
+#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
+
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS 64
+
+typedef struct xfs_agi {
+ /*
+ * Common allocation group header information
+ */
+ __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */
+ __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */
+ __be32 agi_seqno; /* sequence # starting from 0 */
+ __be32 agi_length; /* size in blocks of a.g. */
+ /*
+ * Inode information
+ * Inodes are mapped by interpreting the inode number, so no
+ * mapping data is needed here.
+ */
+ __be32 agi_count; /* count of allocated inodes */
+ __be32 agi_root; /* root of inode btree */
+ __be32 agi_level; /* levels in inode btree */
+ __be32 agi_freecount; /* number of free inodes */
+ __be32 agi_newino; /* new inode just allocated */
+ __be32 agi_dirino; /* last directory inode chunk */
+ /*
+ * Hash table of inodes which have been unlinked but are
+ * still being referenced.
+ */
+ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+} xfs_agi_t;
+
+#define XFS_AGI_MAGICNUM 0x00000001
+#define XFS_AGI_VERSIONNUM 0x00000002
+#define XFS_AGI_SEQNO 0x00000004
+#define XFS_AGI_LENGTH 0x00000008
+#define XFS_AGI_COUNT 0x00000010
+#define XFS_AGI_ROOT 0x00000020
+#define XFS_AGI_LEVEL 0x00000040
+#define XFS_AGI_FREECOUNT 0x00000080
+#define XFS_AGI_NEWINO 0x00000100
+#define XFS_AGI_DIRINO 0x00000200
+#define XFS_AGI_UNLINKED 0x00000400
+#define XFS_AGI_NUM_BITS 11
+#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1)
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
+#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
+#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
+
+extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, struct xfs_buf **bpp);
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
+#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
+#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
+#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp))
+
+typedef struct xfs_agfl {
+ __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+
+/*
+ * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_alloc_busy_insert() for details.
+ */
+struct xfs_busy_extent {
+ struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct list_head list; /* transaction busy extent list */
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t length;
+ unsigned int flags;
+#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
+};
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi,
+ * to improve the performance of allocation group selection.
+ */
+#define XFS_PAGB_NUM_SLOTS 128
+
+typedef struct xfs_perag {
+ struct xfs_mount *pag_mount; /* owner filesystem */
+ xfs_agnumber_t pag_agno; /* AG this structure belongs to */
+ atomic_t pag_ref; /* perag reference count */
+ char pagf_init; /* this agf's entry is initialized */
+ char pagi_init; /* this agi's entry is initialized */
+ char pagf_metadata; /* the agf is preferred to be metadata */
+ char pagi_inodeok; /* The agi is ok for inodes */
+ __uint8_t pagf_levels[XFS_BTNUM_AGF];
+ /* # of levels in bno & cnt btree */
+ __uint32_t pagf_flcount; /* count of blocks in freelist */
+ xfs_extlen_t pagf_freeblks; /* total free blocks */
+ xfs_extlen_t pagf_longest; /* longest free space */
+ __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
+ xfs_agino_t pagi_freecount; /* number of free inodes */
+ xfs_agino_t pagi_count; /* number of allocated inodes */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
+#ifdef __KERNEL__
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
+
+ atomic_t pagf_fstrms; /* # of filestreams active in this AG */
+
+ spinlock_t pag_ici_lock; /* incore inode cache lock */
+ struct radix_tree_root pag_ici_root; /* incore inode cache root */
+ int pag_ici_reclaimable; /* reclaimable inodes */
+ struct mutex pag_ici_reclaim_lock; /* serialisation point */
+ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
+
+ /* buffer cache index */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
+ struct rb_root pag_buf_tree; /* ordered tree of active buffers */
+
+ /* for rcu-safe freeing */
+ struct rcu_head rcu_head;
+#endif
+ int pagb_count; /* pagb slots in use */
+} xfs_perag_t;
+
+/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
+ in xfs_inode_ag_iterator */
+#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
+
+#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
+#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
+ (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
+#define XFS_MIN_FREELIST(a,mp) \
+ (XFS_MIN_FREELIST_RAW( \
+ be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
+ be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
+#define XFS_MIN_FREELIST_PAG(pag,mp) \
+ (XFS_MIN_FREELIST_RAW( \
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+ (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+
+#define XFS_AGB_TO_FSB(mp,agno,agbno) \
+ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#define XFS_FSB_TO_AGNO(mp,fsbno) \
+ ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#define XFS_FSB_TO_AGBNO(mp,fsbno) \
+ ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
+ ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
+ (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
+#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#define XFS_AG_CHECK_DADDR(mp,d,len) \
+ ((len) == 1 ? \
+ ASSERT((d) == XFS_SB_DADDR || \
+ xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+ ASSERT(xfs_daddr_to_agno(mp, d) == \
+ xfs_daddr_to_agno(mp, (d) + (len) - 1)))
+
+#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
new file mode 100644
index 00000000..95862bbf
--- /dev/null
+++ b/fs/xfs/xfs_alloc.c
@@ -0,0 +1,3047 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+
+#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define XFSA_FIXUP_BNO_OK 1
+#define XFSA_FIXUP_CNT_OK 2
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
+ xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
+ xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_eq(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_alloc_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ union xfs_btree_rec rec;
+
+ rec.alloc.ar_startblock = cpu_to_be32(bno);
+ rec.alloc.ar_blockcount = cpu_to_be32(len);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat) /* output: success/failure */
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ *bno = be32_to_cpu(rec->alloc.ar_startblock);
+ *len = be32_to_cpu(rec->alloc.ar_blockcount);
+ }
+ return error;
+}
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC void
+xfs_alloc_compute_aligned(
+ xfs_alloc_arg_t *args, /* allocation argument structure */
+ xfs_agblock_t foundbno, /* starting block in found extent */
+ xfs_extlen_t foundlen, /* length in found extent */
+ xfs_agblock_t *resbno, /* result block number */
+ xfs_extlen_t *reslen) /* result length */
+{
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
+
+ /* Trim busy sections out of found extent */
+ xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+ if (args->alignment > 1 && len >= args->minlen) {
+ xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
+ xfs_extlen_t diff = aligned_bno - bno;
+
+ *resbno = aligned_bno;
+ *reslen = diff >= len ? 0 : len - diff;
+ } else {
+ *resbno = bno;
+ *reslen = len;
+ }
+}
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t /* difference value (absolute) */
+xfs_alloc_compute_diff(
+ xfs_agblock_t wantbno, /* target starting block */
+ xfs_extlen_t wantlen, /* target length */
+ xfs_extlen_t alignment, /* target alignment */
+ xfs_agblock_t freebno, /* freespace's starting block */
+ xfs_extlen_t freelen, /* freespace's length */
+ xfs_agblock_t *newbnop) /* result: best start block from free */
+{
+ xfs_agblock_t freeend; /* end of freespace extent */
+ xfs_agblock_t newbno1; /* return block number */
+ xfs_agblock_t newbno2; /* other new block number */
+ xfs_extlen_t newlen1=0; /* length with newbno1 */
+ xfs_extlen_t newlen2=0; /* length with newbno2 */
+ xfs_agblock_t wantend; /* end of target extent */
+
+ ASSERT(freelen >= wantlen);
+ freeend = freebno + freelen;
+ wantend = wantbno + wantlen;
+ if (freebno >= wantbno) {
+ if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+ newbno1 = NULLAGBLOCK;
+ } else if (freeend >= wantend && alignment > 1) {
+ newbno1 = roundup(wantbno, alignment);
+ newbno2 = newbno1 - alignment;
+ if (newbno1 >= freeend)
+ newbno1 = NULLAGBLOCK;
+ else
+ newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+ if (newbno2 < freebno)
+ newbno2 = NULLAGBLOCK;
+ else
+ newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+ if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+ if (newlen1 < newlen2 ||
+ (newlen1 == newlen2 &&
+ XFS_ABSDIFF(newbno1, wantbno) >
+ XFS_ABSDIFF(newbno2, wantbno)))
+ newbno1 = newbno2;
+ } else if (newbno2 != NULLAGBLOCK)
+ newbno1 = newbno2;
+ } else if (freeend >= wantend) {
+ newbno1 = wantbno;
+ } else if (alignment > 1) {
+ newbno1 = roundup(freeend - wantlen, alignment);
+ if (newbno1 > freeend - wantlen &&
+ newbno1 - alignment >= freebno)
+ newbno1 -= alignment;
+ else if (newbno1 >= freeend)
+ newbno1 = NULLAGBLOCK;
+ } else
+ newbno1 = freeend - wantlen;
+ *newbnop = newbno1;
+ return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_extlen_t k;
+ xfs_extlen_t rlen;
+
+ ASSERT(args->mod < args->prod);
+ rlen = args->len;
+ ASSERT(rlen >= args->minlen);
+ ASSERT(rlen <= args->maxlen);
+ if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+ (args->mod == 0 && rlen < args->prod))
+ return;
+ k = rlen % args->prod;
+ if (k == args->mod)
+ return;
+ if (k > args->mod) {
+ if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
+ return;
+ } else {
+ if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
+ (int)args->minlen)
+ return;
+ }
+ ASSERT(rlen >= args->minlen);
+ ASSERT(rlen <= args->maxlen);
+ args->len = rlen;
+}
+
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_agf_t *agf; /* a.g. freelist header */
+ int diff; /* free space difference */
+
+ if (args->minleft == 0)
+ return 1;
+ agf = XFS_BUF_TO_AGF(args->agbp);
+ diff = be32_to_cpu(agf->agf_freeblks)
+ - args->len - args->minleft;
+ if (diff >= 0)
+ return 1;
+ args->len += diff; /* shrink the allocated space */
+ if (args->len >= args->minlen)
+ return 1;
+ args->agbno = NULLAGBLOCK;
+ return 0;
+}
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks. The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int /* error code */
+xfs_alloc_fixup_trees(
+ xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
+ xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
+ xfs_agblock_t fbno, /* starting block of free extent */
+ xfs_extlen_t flen, /* length of free extent */
+ xfs_agblock_t rbno, /* starting block of returned extent */
+ xfs_extlen_t rlen, /* length of returned extent */
+ int flags) /* flags, XFSA_FIXUP_... */
+{
+ int error; /* error code */
+ int i; /* operation results */
+ xfs_agblock_t nfbno1; /* first new free startblock */
+ xfs_agblock_t nfbno2; /* second new free startblock */
+ xfs_extlen_t nflen1=0; /* first new free length */
+ xfs_extlen_t nflen2=0; /* second new free length */
+
+ /*
+ * Look up the record in the by-size tree if necessary.
+ */
+ if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+ if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(
+ i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+ } else {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+ /*
+ * Look up the record in the by-block tree if necessary.
+ */
+ if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+ if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(
+ i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+ } else {
+ if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+#ifdef DEBUG
+ if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+ struct xfs_btree_block *bnoblock;
+ struct xfs_btree_block *cntblock;
+
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+ XFS_WANT_CORRUPTED_RETURN(
+ bnoblock->bb_numrecs == cntblock->bb_numrecs);
+ }
+#endif
+
+ /*
+ * Deal with all four cases: the allocated record is contained
+ * within the freespace record, so we can have new freespace
+ * at either (or both) end, or no freespace remaining.
+ */
+ if (rbno == fbno && rlen == flen)
+ nfbno1 = nfbno2 = NULLAGBLOCK;
+ else if (rbno == fbno) {
+ nfbno1 = rbno + rlen;
+ nflen1 = flen - rlen;
+ nfbno2 = NULLAGBLOCK;
+ } else if (rbno + rlen == fbno + flen) {
+ nfbno1 = fbno;
+ nflen1 = flen - rlen;
+ nfbno2 = NULLAGBLOCK;
+ } else {
+ nfbno1 = fbno;
+ nflen1 = rbno - fbno;
+ nfbno2 = rbno + rlen;
+ nflen2 = (fbno + flen) - nfbno2;
+ }
+ /*
+ * Delete the entry from the by-size btree.
+ */
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ /*
+ * Add new by-size btree entry(s).
+ */
+ if (nfbno1 != NULLAGBLOCK) {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 0);
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+ if (nfbno2 != NULLAGBLOCK) {
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 0);
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+ /*
+ * Fix up the by-block btree entry(s).
+ */
+ if (nfbno1 == NULLAGBLOCK) {
+ /*
+ * No remaining freespace, just delete the by-block tree entry.
+ */
+ if ((error = xfs_btree_delete(bno_cur, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ } else {
+ /*
+ * Update the by-block entry to start later|be shorter.
+ */
+ if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+ return error;
+ }
+ if (nfbno2 != NULLAGBLOCK) {
+ /*
+ * 2 resulting free entries, need to add one.
+ */
+ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 0);
+ if ((error = xfs_btree_insert(bno_cur, &i)))
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+ return 0;
+}
+
+/*
+ * Read in the allocation group free block array.
+ */
+STATIC int /* error */
+xfs_alloc_read_agfl(
+ xfs_mount_t *mp, /* mount point structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_buf_t **bpp) /* buffer for the ag free block array */
+{
+ xfs_buf_t *bp; /* return value */
+ int error;
+
+ ASSERT(agno != NULLAGNUMBER);
+ error = xfs_trans_read_buf(
+ mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error)
+ return error;
+ ASSERT(bp);
+ ASSERT(!XFS_BUF_GETERROR(bp));
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
+ *bpp = bp;
+ return 0;
+}
+
+STATIC int
+xfs_alloc_update_counters(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agbp,
+ long len)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+
+ pag->pagf_freeblks += len;
+ be32_add_cpu(&agf->agf_freeblks, len);
+
+ xfs_trans_agblocks_delta(tp, len);
+ if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+ be32_to_cpu(agf->agf_length)))
+ return EFSCORRUPTED;
+
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+ return 0;
+}
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent(
+ xfs_alloc_arg_t *args) /* argument structure for allocation */
+{
+ int error=0;
+
+ ASSERT(args->minlen > 0);
+ ASSERT(args->maxlen > 0);
+ ASSERT(args->minlen <= args->maxlen);
+ ASSERT(args->mod < args->prod);
+ ASSERT(args->alignment > 0);
+ /*
+ * Branch to correct routine based on the type.
+ */
+ args->wasfromfl = 0;
+ switch (args->type) {
+ case XFS_ALLOCTYPE_THIS_AG:
+ error = xfs_alloc_ag_vextent_size(args);
+ break;
+ case XFS_ALLOCTYPE_NEAR_BNO:
+ error = xfs_alloc_ag_vextent_near(args);
+ break;
+ case XFS_ALLOCTYPE_THIS_BNO:
+ error = xfs_alloc_ag_vextent_exact(args);
+ break;
+ default:
+ ASSERT(0);
+ /* NOTREACHED */
+ }
+
+ if (error || args->agbno == NULLAGBLOCK)
+ return error;
+
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(!args->wasfromfl || !args->isfl);
+ ASSERT(args->agbno % args->alignment == 0);
+
+ if (!args->wasfromfl) {
+ error = xfs_alloc_update_counters(args->tp, args->pag,
+ args->agbp,
+ -((long)(args->len)));
+ if (error)
+ return error;
+
+ ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+ args->agbno, args->len));
+ }
+
+ if (!args->isfl) {
+ xfs_trans_mod_sb(args->tp, args->wasdel ?
+ XFS_TRANS_SB_RES_FDBLOCKS :
+ XFS_TRANS_SB_FDBLOCKS,
+ -((long)(args->len)));
+ }
+
+ XFS_STATS_INC(xs_allocx);
+ XFS_STATS_ADD(xs_allocb, args->len);
+ return error;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_exact(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
+ xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+ int error;
+ xfs_agblock_t fbno; /* start block of found extent */
+ xfs_extlen_t flen; /* length of found extent */
+ xfs_agblock_t tbno; /* start block of trimmed extent */
+ xfs_extlen_t tlen; /* length of trimmed extent */
+ xfs_agblock_t tend; /* end block of trimmed extent */
+ xfs_agblock_t end; /* end of allocated extent */
+ int i; /* success/failure of operation */
+ xfs_extlen_t rlen; /* length of returned extent */
+
+ ASSERT(args->alignment == 1);
+
+ /*
+ * Allocate/initialize a cursor for the by-number freespace btree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
+
+ /*
+ * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+ * Look for the closest free block <= bno, it must contain bno
+ * if any free block does.
+ */
+ error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+ if (error)
+ goto error0;
+ if (!i)
+ goto not_found;
+
+ /*
+ * Grab the freespace record.
+ */
+ error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ ASSERT(fbno <= args->agbno);
+
+ /*
+ * Check for overlapping busy extents.
+ */
+ xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+ /*
+ * Give up if the start of the extent is busy, or the freespace isn't
+ * long enough for the minimum request.
+ */
+ if (tbno > args->agbno)
+ goto not_found;
+ if (tlen < args->minlen)
+ goto not_found;
+ tend = tbno + tlen;
+ if (tend < args->agbno + args->minlen)
+ goto not_found;
+
+ /*
+ * End of extent will be smaller of the freespace end and the
+ * maximal requested end.
+ *
+ * Fix the length according to mod and prod if given.
+ */
+ end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
+ args->len = end - args->agbno;
+ xfs_alloc_fix_len(args);
+ if (!xfs_alloc_fix_minleft(args))
+ goto not_found;
+
+ rlen = args->len;
+ ASSERT(args->agbno + rlen <= tend);
+ end = args->agbno + rlen;
+
+ /*
+ * We are allocating agbno for rlen [agbno .. end]
+ * Allocate/initialize a cursor for the by-size btree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
+ ASSERT(args->agbno + args->len <=
+ be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+ args->len, XFSA_FIXUP_BNO_OK);
+ if (error) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ goto error0;
+ }
+
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+ args->wasfromfl = 0;
+ trace_xfs_alloc_exact_done(args);
+ return 0;
+
+not_found:
+ /* Didn't find it, return null. */
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_exact_notfound(args);
+ return 0;
+
+error0:
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ trace_xfs_alloc_exact_error(args);
+ return error;
+}
+
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur **gcur, /* good cursor */
+ struct xfs_btree_cur **scur, /* searching cursor */
+ xfs_agblock_t gdiff, /* difference for search comparison */
+ xfs_agblock_t *sbno, /* extent found by search */
+ xfs_extlen_t *slen, /* extent length */
+ xfs_agblock_t *sbnoa, /* aligned extent found by search */
+ xfs_extlen_t *slena, /* aligned extent length */
+ int dir) /* 0 = search right, 1 = search left */
+{
+ xfs_agblock_t new;
+ xfs_agblock_t sdiff;
+ int error;
+ int i;
+
+ /* The good extent is perfect, no need to search. */
+ if (!gdiff)
+ goto out_use_good;
+
+ /*
+ * Look until we find a better one, run out of space or run off the end.
+ */
+ do {
+ error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+
+ /*
+ * The good extent is closer than this one.
+ */
+ if (!dir) {
+ if (*sbnoa >= args->agbno + gdiff)
+ goto out_use_good;
+ } else {
+ if (*sbnoa <= args->agbno - gdiff)
+ goto out_use_good;
+ }
+
+ /*
+ * Same distance, compare length and pick the best.
+ */
+ if (*slena >= args->minlen) {
+ args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+ xfs_alloc_fix_len(args);
+
+ sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, *sbnoa,
+ *slena, &new);
+
+ /*
+ * Choose closer size and invalidate other cursor.
+ */
+ if (sdiff < gdiff)
+ goto out_use_search;
+ goto out_use_good;
+ }
+
+ if (!dir)
+ error = xfs_btree_increment(*scur, 0, &i);
+ else
+ error = xfs_btree_decrement(*scur, 0, &i);
+ if (error)
+ goto error0;
+ } while (i);
+
+out_use_good:
+ xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+ *scur = NULL;
+ return 0;
+
+out_use_search:
+ xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+ *gcur = NULL;
+ return 0;
+
+error0:
+ /* caller invalidates cursors */
+ return error;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_near(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
+ xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
+ xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
+ xfs_agblock_t gtbno; /* start bno of right side entry */
+ xfs_agblock_t gtbnoa; /* aligned ... */
+ xfs_extlen_t gtdiff; /* difference to right side entry */
+ xfs_extlen_t gtlen; /* length of right side entry */
+ xfs_extlen_t gtlena; /* aligned ... */
+ xfs_agblock_t gtnew; /* useful start bno of right side */
+ int error; /* error code */
+ int i; /* result code, temporary */
+ int j; /* result code, temporary */
+ xfs_agblock_t ltbno; /* start bno of left side entry */
+ xfs_agblock_t ltbnoa; /* aligned ... */
+ xfs_extlen_t ltdiff; /* difference to left side entry */
+ xfs_extlen_t ltlen; /* length of left side entry */
+ xfs_extlen_t ltlena; /* aligned ... */
+ xfs_agblock_t ltnew; /* useful start bno of left side */
+ xfs_extlen_t rlen; /* length of returned extent */
+ int forced = 0;
+#if defined(DEBUG) && defined(__KERNEL__)
+ /*
+ * Randomly don't execute the first algorithm.
+ */
+ int dofirst; /* set to do first algorithm */
+
+ dofirst = random32() & 1;
+#endif
+
+restart:
+ bno_cur_lt = NULL;
+ bno_cur_gt = NULL;
+ ltlen = 0;
+ gtlena = 0;
+ ltlena = 0;
+
+ /*
+ * Get a cursor for the by-size btree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
+
+ /*
+ * See if there are any free extents as big as maxlen.
+ */
+ if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
+ goto error0;
+ /*
+ * If none, then pick up the last entry in the tree unless the
+ * tree is empty.
+ */
+ if (!i) {
+ if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
+ &ltlen, &i)))
+ goto error0;
+ if (i == 0 || ltlen == 0) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_near_noentry(args);
+ return 0;
+ }
+ ASSERT(i == 1);
+ }
+ args->wasfromfl = 0;
+
+ /*
+ * First algorithm.
+ * If the requested extent is large wrt the freespaces available
+ * in this a.g., then the cursor will be pointing to a btree entry
+ * near the right edge of the tree. If it's in the last btree leaf
+ * block, then we just examine all the entries in that block
+ * that are big enough, and pick the best one.
+ * This is written as a while loop so we can break out of it,
+ * but we never loop back to the top.
+ */
+ while (xfs_btree_islastblock(cnt_cur, 0)) {
+ xfs_extlen_t bdiff;
+ int besti=0;
+ xfs_extlen_t blen=0;
+ xfs_agblock_t bnew=0;
+
+#if defined(DEBUG) && defined(__KERNEL__)
+ if (!dofirst)
+ break;
+#endif
+ /*
+ * Start from the entry that lookup found, sequence through
+ * all larger free blocks. If we're actually pointing at a
+ * record smaller than maxlen, go to the start of this block,
+ * and skip all those smaller than minlen.
+ */
+ if (ltlen || args->alignment > 1) {
+ cnt_cur->bc_ptrs[0] = 1;
+ do {
+ if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
+ &ltlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (ltlen >= args->minlen)
+ break;
+ if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
+ goto error0;
+ } while (i);
+ ASSERT(ltlen >= args->minlen);
+ if (!i)
+ break;
+ }
+ i = cnt_cur->bc_ptrs[0];
+ for (j = 1, blen = 0, bdiff = 0;
+ !error && j && (blen < args->maxlen || bdiff > 0);
+ error = xfs_btree_increment(cnt_cur, 0, &j)) {
+ /*
+ * For each entry, decide if it's better than
+ * the previous best entry.
+ */
+ if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ xfs_alloc_compute_aligned(args, ltbno, ltlen,
+ &ltbnoa, &ltlena);
+ if (ltlena < args->minlen)
+ continue;
+ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ ASSERT(args->len >= args->minlen);
+ if (args->len < blen)
+ continue;
+ ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, ltbnoa, ltlena, &ltnew);
+ if (ltnew != NULLAGBLOCK &&
+ (args->len > blen || ltdiff < bdiff)) {
+ bdiff = ltdiff;
+ bnew = ltnew;
+ blen = args->len;
+ besti = cnt_cur->bc_ptrs[0];
+ }
+ }
+ /*
+ * It didn't work. We COULD be in a case where
+ * there's a good record somewhere, so try again.
+ */
+ if (blen == 0)
+ break;
+ /*
+ * Point at the best entry, and retrieve it again.
+ */
+ cnt_cur->bc_ptrs[0] = besti;
+ if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ args->len = blen;
+ if (!xfs_alloc_fix_minleft(args)) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_near_nominleft(args);
+ return 0;
+ }
+ blen = args->len;
+ /*
+ * We are allocating starting at bnew for blen blocks.
+ */
+ args->agbno = bnew;
+ ASSERT(bnew >= ltbno);
+ ASSERT(bnew + blen <= ltbno + ltlen);
+ /*
+ * Set up a cursor for the by-bno tree.
+ */
+ bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
+ /*
+ * Fix up the btree entries.
+ */
+ if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
+ ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
+ goto error0;
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+
+ trace_xfs_alloc_near_first(args);
+ return 0;
+ }
+ /*
+ * Second algorithm.
+ * Search in the by-bno tree to the left and to the right
+ * simultaneously, until in each case we find a space big enough,
+ * or run into the edge of the tree. When we run into the edge,
+ * we deallocate that cursor.
+ * If both searches succeed, we compare the two spaces and pick
+ * the better one.
+ * With alignment, it's possible for both to fail; the upper
+ * level algorithm that picks allocation groups for allocations
+ * is not supposed to do this.
+ */
+ /*
+ * Allocate and initialize the cursor for the leftward search.
+ */
+ bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
+ /*
+ * Lookup <= bno to find the leftward search's starting point.
+ */
+ if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
+ goto error0;
+ if (!i) {
+ /*
+ * Didn't find anything; use this cursor for the rightward
+ * search.
+ */
+ bno_cur_gt = bno_cur_lt;
+ bno_cur_lt = NULL;
+ }
+ /*
+ * Found something. Duplicate the cursor for the rightward search.
+ */
+ else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
+ goto error0;
+ /*
+ * Increment the cursor, so we will point at the entry just right
+ * of the leftward entry if any, or to the leftmost entry.
+ */
+ if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+ goto error0;
+ if (!i) {
+ /*
+ * It failed, there are no rightward entries.
+ */
+ xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
+ bno_cur_gt = NULL;
+ }
+ /*
+ * Loop going left with the leftward cursor, right with the
+ * rightward cursor, until either both directions give up or
+ * we find an entry at least as big as minlen.
+ */
+ do {
+ if (bno_cur_lt) {
+ if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ xfs_alloc_compute_aligned(args, ltbno, ltlen,
+ &ltbnoa, &ltlena);
+ if (ltlena >= args->minlen)
+ break;
+ if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
+ goto error0;
+ if (!i) {
+ xfs_btree_del_cursor(bno_cur_lt,
+ XFS_BTREE_NOERROR);
+ bno_cur_lt = NULL;
+ }
+ }
+ if (bno_cur_gt) {
+ if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ xfs_alloc_compute_aligned(args, gtbno, gtlen,
+ &gtbnoa, &gtlena);
+ if (gtlena >= args->minlen)
+ break;
+ if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
+ goto error0;
+ if (!i) {
+ xfs_btree_del_cursor(bno_cur_gt,
+ XFS_BTREE_NOERROR);
+ bno_cur_gt = NULL;
+ }
+ }
+ } while (bno_cur_lt || bno_cur_gt);
+
+ /*
+ * Got both cursors still active, need to find better entry.
+ */
+ if (bno_cur_lt && bno_cur_gt) {
+ if (ltlena >= args->minlen) {
+ /*
+ * Left side is good, look for a right side entry.
+ */
+ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, ltbnoa, ltlena, &ltnew);
+
+ error = xfs_alloc_find_best_extent(args,
+ &bno_cur_lt, &bno_cur_gt,
+ ltdiff, &gtbno, &gtlen,
+ &gtbnoa, &gtlena,
+ 0 /* search right */);
+ } else {
+ ASSERT(gtlena >= args->minlen);
+
+ /*
+ * Right side is good, look for a left side entry.
+ */
+ args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, gtbnoa, gtlena, &gtnew);
+
+ error = xfs_alloc_find_best_extent(args,
+ &bno_cur_gt, &bno_cur_lt,
+ gtdiff, &ltbno, &ltlen,
+ &ltbnoa, &ltlena,
+ 1 /* search left */);
+ }
+
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If we couldn't get anything, give up.
+ */
+ if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+ if (!forced++) {
+ trace_xfs_alloc_near_busy(args);
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+
+ trace_xfs_alloc_size_neither(args);
+ args->agbno = NULLAGBLOCK;
+ return 0;
+ }
+
+ /*
+ * At this point we have selected a freespace entry, either to the
+ * left or to the right. If it's on the right, copy all the
+ * useful variables to the "left" set so we only have one
+ * copy of this code.
+ */
+ if (bno_cur_gt) {
+ bno_cur_lt = bno_cur_gt;
+ bno_cur_gt = NULL;
+ ltbno = gtbno;
+ ltbnoa = gtbnoa;
+ ltlen = gtlen;
+ ltlena = gtlena;
+ j = 1;
+ } else
+ j = 0;
+
+ /*
+ * Fix up the length and compute the useful address.
+ */
+ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ if (!xfs_alloc_fix_minleft(args)) {
+ trace_xfs_alloc_near_nominleft(args);
+ xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ return 0;
+ }
+ rlen = args->len;
+ (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+ ltbnoa, ltlena, &ltnew);
+ ASSERT(ltnew >= ltbno);
+ ASSERT(ltnew + rlen <= ltbnoa + ltlena);
+ ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ args->agbno = ltnew;
+
+ if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
+ ltnew, rlen, XFSA_FIXUP_BNO_OK)))
+ goto error0;
+
+ if (j)
+ trace_xfs_alloc_near_greater(args);
+ else
+ trace_xfs_alloc_near_lesser(args);
+
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+ return 0;
+
+ error0:
+ trace_xfs_alloc_near_error(args);
+ if (cnt_cur != NULL)
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ if (bno_cur_lt != NULL)
+ xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
+ if (bno_cur_gt != NULL)
+ xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_size(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
+ xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
+ int error; /* error result */
+ xfs_agblock_t fbno; /* start of found freespace */
+ xfs_extlen_t flen; /* length of found freespace */
+ int i; /* temp status variable */
+ xfs_agblock_t rbno; /* returned block number */
+ xfs_extlen_t rlen; /* length of returned extent */
+ int forced = 0;
+
+restart:
+ /*
+ * Allocate and initialize a cursor for the by-size btree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
+ bno_cur = NULL;
+
+ /*
+ * Look for an entry >= maxlen+alignment-1 blocks.
+ */
+ if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+ args->maxlen + args->alignment - 1, &i)))
+ goto error0;
+
+ /*
+ * If none or we have busy extents that we cannot allocate from, then
+ * we have to settle for a smaller extent. In the case that there are
+ * no large extents, this will return the last entry in the tree unless
+ * the tree is empty. In the case that there are only busy large
+ * extents, this will return the largest small extent unless there
+ * are no smaller extents available.
+ */
+ if (!i || forced > 1) {
+ error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+ &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ if (i == 0 || flen == 0) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_noentry(args);
+ return 0;
+ }
+ ASSERT(i == 1);
+ xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+ } else {
+ /*
+ * Search for a non-busy extent that is large enough.
+ * If we are at low space, don't check, or if we fall of
+ * the end of the btree, turn off the busy check and
+ * restart.
+ */
+ for (;;) {
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen);
+
+ if (rlen >= args->maxlen)
+ break;
+
+ error = xfs_btree_increment(cnt_cur, 0, &i);
+ if (error)
+ goto error0;
+ if (i == 0) {
+ /*
+ * Our only valid extents must have been busy.
+ * Make it unbusy by forcing the log out and
+ * retrying. If we've been here before, forcing
+ * the log isn't making the extents available,
+ * which means they have probably been freed in
+ * this transaction. In that case, we have to
+ * give up on them and we'll attempt a minlen
+ * allocation the next time around.
+ */
+ xfs_btree_del_cursor(cnt_cur,
+ XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ if (!forced++)
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ }
+ }
+
+ /*
+ * In the first case above, we got the last entry in the
+ * by-size btree. Now we check to see if the space hits maxlen
+ * once aligned; if not, we search left for something better.
+ * This can't happen in the second case above.
+ */
+ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+ XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+ (rlen <= flen && rbno + rlen <= fbno + flen), error0);
+ if (rlen < args->maxlen) {
+ xfs_agblock_t bestfbno;
+ xfs_extlen_t bestflen;
+ xfs_agblock_t bestrbno;
+ xfs_extlen_t bestrlen;
+
+ bestrlen = rlen;
+ bestrbno = rbno;
+ bestflen = flen;
+ bestfbno = fbno;
+ for (;;) {
+ if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
+ goto error0;
+ if (i == 0)
+ break;
+ if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+ &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (flen < bestrlen)
+ break;
+ xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen);
+ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+ XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+ (rlen <= flen && rbno + rlen <= fbno + flen),
+ error0);
+ if (rlen > bestrlen) {
+ bestrlen = rlen;
+ bestrbno = rbno;
+ bestflen = flen;
+ bestfbno = fbno;
+ if (rlen == args->maxlen)
+ break;
+ }
+ }
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+ &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ rlen = bestrlen;
+ rbno = bestrbno;
+ flen = bestflen;
+ fbno = bestfbno;
+ }
+ args->wasfromfl = 0;
+ /*
+ * Fix up the length.
+ */
+ args->len = rlen;
+ if (rlen < args->minlen) {
+ if (!forced++) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ goto out_nominleft;
+ }
+ xfs_alloc_fix_len(args);
+
+ if (!xfs_alloc_fix_minleft(args))
+ goto out_nominleft;
+ rlen = args->len;
+ XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+ /*
+ * Allocate and initialize a cursor for the by-block tree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
+ if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+ rbno, rlen, XFSA_FIXUP_CNT_OK)))
+ goto error0;
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ cnt_cur = bno_cur = NULL;
+ args->len = rlen;
+ args->agbno = rbno;
+ XFS_WANT_CORRUPTED_GOTO(
+ args->agbno + args->len <=
+ be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+ error0);
+ trace_xfs_alloc_size_done(args);
+ return 0;
+
+error0:
+ trace_xfs_alloc_size_error(args);
+ if (cnt_cur)
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ if (bno_cur)
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ return error;
+
+out_nominleft:
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_nominleft(args);
+ args->agbno = NULLAGBLOCK;
+ return 0;
+}
+
+/*
+ * Deal with the case where only small freespaces remain.
+ * Either return the contents of the last freespace record,
+ * or allocate space from the freelist if there is nothing in the tree.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_small(
+ xfs_alloc_arg_t *args, /* allocation argument structure */
+ xfs_btree_cur_t *ccur, /* by-size cursor */
+ xfs_agblock_t *fbnop, /* result block number */
+ xfs_extlen_t *flenp, /* result length */
+ int *stat) /* status: 0-freelist, 1-normal/none */
+{
+ int error;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int i;
+
+ if ((error = xfs_btree_decrement(ccur, 0, &i)))
+ goto error0;
+ if (i) {
+ if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+ /*
+ * Nothing in the btree, try the freelist. Make sure
+ * to respect minleft even when pulling from the
+ * freelist.
+ */
+ else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+ (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
+ > args->minleft)) {
+ error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ if (error)
+ goto error0;
+ if (fbno != NULLAGBLOCK) {
+ xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+ args->userdata);
+
+ if (args->userdata) {
+ xfs_buf_t *bp;
+
+ bp = xfs_btree_get_bufs(args->mp, args->tp,
+ args->agno, fbno, 0);
+ xfs_trans_binval(args->tp, bp);
+ }
+ args->len = 1;
+ args->agbno = fbno;
+ XFS_WANT_CORRUPTED_GOTO(
+ args->agbno + args->len <=
+ be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+ error0);
+ args->wasfromfl = 1;
+ trace_xfs_alloc_small_freelist(args);
+ *stat = 0;
+ return 0;
+ }
+ /*
+ * Nothing in the freelist.
+ */
+ else
+ flen = 0;
+ }
+ /*
+ * Can't allocate from the freelist for some reason.
+ */
+ else {
+ fbno = NULLAGBLOCK;
+ flen = 0;
+ }
+ /*
+ * Can't do the allocation, give up.
+ */
+ if (flen < args->minlen) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_small_notenough(args);
+ flen = 0;
+ }
+ *fbnop = fbno;
+ *flenp = flen;
+ *stat = 1;
+ trace_xfs_alloc_small_done(args);
+ return 0;
+
+error0:
+ trace_xfs_alloc_small_error(args);
+ return error;
+}
+
+/*
+ * Free the extent starting at agno/bno for length.
+ */
+STATIC int /* error */
+xfs_free_ag_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_buf_t *agbp, /* buffer for a.g. freelist header */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t bno, /* starting block number */
+ xfs_extlen_t len, /* length of extent */
+ int isfl) /* set if is freelist blocks - no sb acctg */
+{
+ xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
+ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
+ int error; /* error return value */
+ xfs_agblock_t gtbno; /* start of right neighbor block */
+ xfs_extlen_t gtlen; /* length of right neighbor block */
+ int haveleft; /* have a left neighbor block */
+ int haveright; /* have a right neighbor block */
+ int i; /* temp, result code */
+ xfs_agblock_t ltbno; /* start of left neighbor block */
+ xfs_extlen_t ltlen; /* length of left neighbor block */
+ xfs_mount_t *mp; /* mount point struct for filesystem */
+ xfs_agblock_t nbno; /* new starting block of freespace */
+ xfs_extlen_t nlen; /* new length of freespace */
+ xfs_perag_t *pag; /* per allocation group data */
+
+ mp = tp->t_mountp;
+ /*
+ * Allocate and initialize a cursor for the by-block btree.
+ */
+ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+ cnt_cur = NULL;
+ /*
+ * Look for a neighboring block on the left (lower block numbers)
+ * that is contiguous with this space.
+ */
+ if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+ goto error0;
+ if (haveleft) {
+ /*
+ * There is a block to our left.
+ */
+ if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * It's not contiguous, though.
+ */
+ if (ltbno + ltlen < bno)
+ haveleft = 0;
+ else {
+ /*
+ * If this failure happens the request to free this
+ * space was invalid, it's (partly) already free.
+ * Very bad.
+ */
+ XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+ }
+ }
+ /*
+ * Look for a neighboring block on the right (higher block numbers)
+ * that is contiguous with this space.
+ */
+ if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
+ goto error0;
+ if (haveright) {
+ /*
+ * There is a block to our right.
+ */
+ if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * It's not contiguous, though.
+ */
+ if (bno + len < gtbno)
+ haveright = 0;
+ else {
+ /*
+ * If this failure happens the request to free this
+ * space was invalid, it's (partly) already free.
+ * Very bad.
+ */
+ XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+ }
+ }
+ /*
+ * Now allocate and initialize a cursor for the by-size tree.
+ */
+ cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
+ /*
+ * Have both left and right contiguous neighbors.
+ * Merge all three into a single free block.
+ */
+ if (haveleft && haveright) {
+ /*
+ * Delete the old by-size entry on the left.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * Delete the old by-size entry on the right.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * Delete the old by-block entry for the right block.
+ */
+ if ((error = xfs_btree_delete(bno_cur, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * Move the by-block cursor back to the left neighbor.
+ */
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+#ifdef DEBUG
+ /*
+ * Check that this is the right record: delete didn't
+ * mangle the cursor.
+ */
+ {
+ xfs_agblock_t xxbno;
+ xfs_extlen_t xxlen;
+
+ if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+ &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(
+ i == 1 && xxbno == ltbno && xxlen == ltlen,
+ error0);
+ }
+#endif
+ /*
+ * Update remaining by-block entry to the new, joined block.
+ */
+ nbno = ltbno;
+ nlen = len + ltlen + gtlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * Have only a left contiguous neighbor.
+ * Merge it together with the new freespace.
+ */
+ else if (haveleft) {
+ /*
+ * Delete the old by-size entry on the left.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * Back up the by-block cursor to the left neighbor, and
+ * update its length.
+ */
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ nbno = ltbno;
+ nlen = len + ltlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * Have only a right contiguous neighbor.
+ * Merge it together with the new freespace.
+ */
+ else if (haveright) {
+ /*
+ * Delete the old by-size entry on the right.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * Update the starting block and length of the right
+ * neighbor in the by-block tree.
+ */
+ nbno = bno;
+ nlen = len + gtlen;
+ if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+ goto error0;
+ }
+ /*
+ * No contiguous neighbors.
+ * Insert the new freespace into the by-block tree.
+ */
+ else {
+ nbno = bno;
+ nlen = len;
+ if ((error = xfs_btree_insert(bno_cur, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+ bno_cur = NULL;
+ /*
+ * In all cases we need to insert the new freespace in the by-size tree.
+ */
+ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ cnt_cur = NULL;
+
+ /*
+ * Update the freespace totals in the ag and superblock.
+ */
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_update_counters(tp, pag, agbp, len);
+ xfs_perag_put(pag);
+ if (error)
+ goto error0;
+
+ if (!isfl)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+ XFS_STATS_INC(xs_freex);
+ XFS_STATS_ADD(xs_freeb, len);
+
+ trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+
+ return 0;
+
+ error0:
+ trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
+ if (bno_cur)
+ xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+ if (cnt_cur)
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ int level;
+ uint maxblocks;
+ uint maxleafents;
+ int minleafrecs;
+ int minnoderecs;
+
+ maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
+ minleafrecs = mp->m_alloc_mnr[0];
+ minnoderecs = mp->m_alloc_mnr[1];
+ maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ for (level = 1; maxblocks > 1; level++)
+ maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ mp->m_ag_maxlevels = level;
+}
+
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ xfs_extlen_t need, delta = 0;
+
+ need = XFS_MIN_FREELIST_PAG(pag, mp);
+ if (need > pag->pagf_flcount)
+ delta = need - pag->pagf_flcount;
+
+ if (pag->pagf_longest > delta)
+ return pag->pagf_longest - delta;
+ return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ */
+STATIC int /* error */
+xfs_alloc_fix_freelist(
+ xfs_alloc_arg_t *args, /* allocation argument structure */
+ int flags) /* XFS_ALLOC_FLAG_... */
+{
+ xfs_buf_t *agbp; /* agf buffer pointer */
+ xfs_agf_t *agf; /* a.g. freespace structure pointer */
+ xfs_buf_t *agflbp;/* agfl buffer pointer */
+ xfs_agblock_t bno; /* freelist block */
+ xfs_extlen_t delta; /* new blocks needed in freelist */
+ int error; /* error result code */
+ xfs_extlen_t longest;/* longest extent in allocation group */
+ xfs_mount_t *mp; /* file system mount point structure */
+ xfs_extlen_t need; /* total blocks needed in freelist */
+ xfs_perag_t *pag; /* per-ag information structure */
+ xfs_alloc_arg_t targs; /* local allocation arguments */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ mp = args->mp;
+
+ pag = args->pag;
+ tp = args->tp;
+ if (!pag->pagf_init) {
+ if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+ &agbp)))
+ return error;
+ if (!pag->pagf_init) {
+ ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ args->agbp = NULL;
+ return 0;
+ }
+ } else
+ agbp = NULL;
+
+ /*
+ * If this is a metadata preferred pag and we are user data
+ * then try somewhere else if we are not being asked to
+ * try harder at this point
+ */
+ if (pag->pagf_metadata && args->userdata &&
+ (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ args->agbp = NULL;
+ return 0;
+ }
+
+ if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+ /*
+ * If it looks like there isn't a long enough extent, or enough
+ * total blocks, reject it.
+ */
+ need = XFS_MIN_FREELIST_PAG(pag, mp);
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if ((args->minlen + args->alignment + args->minalignslop - 1) >
+ longest ||
+ ((int)(pag->pagf_freeblks + pag->pagf_flcount -
+ need - args->total) < (int)args->minleft)) {
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+ args->agbp = NULL;
+ return 0;
+ }
+ }
+
+ /*
+ * Get the a.g. freespace buffer.
+ * Can fail if we're not blocking on locks, and it's held.
+ */
+ if (agbp == NULL) {
+ if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+ &agbp)))
+ return error;
+ if (agbp == NULL) {
+ ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+ ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ args->agbp = NULL;
+ return 0;
+ }
+ }
+ /*
+ * Figure out how many blocks we should have in the freelist.
+ */
+ agf = XFS_BUF_TO_AGF(agbp);
+ need = XFS_MIN_FREELIST(agf, mp);
+ /*
+ * If there isn't enough total or single-extent, reject it.
+ */
+ if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+ delta = need > be32_to_cpu(agf->agf_flcount) ?
+ (need - be32_to_cpu(agf->agf_flcount)) : 0;
+ longest = be32_to_cpu(agf->agf_longest);
+ longest = (longest > delta) ? (longest - delta) :
+ (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
+ if ((args->minlen + args->alignment + args->minalignslop - 1) >
+ longest ||
+ ((int)(be32_to_cpu(agf->agf_freeblks) +
+ be32_to_cpu(agf->agf_flcount) - need - args->total) <
+ (int)args->minleft)) {
+ xfs_trans_brelse(tp, agbp);
+ args->agbp = NULL;
+ return 0;
+ }
+ }
+ /*
+ * Make the freelist shorter if it's too long.
+ */
+ while (be32_to_cpu(agf->agf_flcount) > need) {
+ xfs_buf_t *bp;
+
+ error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+ if (error)
+ return error;
+ if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
+ return error;
+ bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+ xfs_trans_binval(tp, bp);
+ }
+ /*
+ * Initialize the args structure.
+ */
+ targs.tp = tp;
+ targs.mp = mp;
+ targs.agbp = agbp;
+ targs.agno = args->agno;
+ targs.mod = targs.minleft = targs.wasdel = targs.userdata =
+ targs.minalignslop = 0;
+ targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+ targs.type = XFS_ALLOCTYPE_THIS_AG;
+ targs.pag = pag;
+ if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+ return error;
+ /*
+ * Make the freelist longer if it's too short.
+ */
+ while (be32_to_cpu(agf->agf_flcount) < need) {
+ targs.agbno = 0;
+ targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
+ /*
+ * Allocate as many blocks as possible at once.
+ */
+ if ((error = xfs_alloc_ag_vextent(&targs))) {
+ xfs_trans_brelse(tp, agflbp);
+ return error;
+ }
+ /*
+ * Stop if we run out. Won't happen if callers are obeying
+ * the restrictions correctly. Can happen for free calls
+ * on a completely full ag.
+ */
+ if (targs.agbno == NULLAGBLOCK) {
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ break;
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = NULL;
+ return 0;
+ }
+ /*
+ * Put each allocated block on the list.
+ */
+ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+ error = xfs_alloc_put_freelist(tp, agbp,
+ agflbp, bno, 0);
+ if (error)
+ return error;
+ }
+ }
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = agbp;
+ return 0;
+}
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int /* error */
+xfs_alloc_get_freelist(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_buf_t *agbp, /* buffer containing the agf structure */
+ xfs_agblock_t *bnop, /* block address retrieved from freelist */
+ int btreeblk) /* destination is a AGF btree */
+{
+ xfs_agf_t *agf; /* a.g. freespace structure */
+ xfs_agfl_t *agfl; /* a.g. freelist structure */
+ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
+ xfs_agblock_t bno; /* block number returned */
+ int error;
+ int logflags;
+ xfs_mount_t *mp; /* mount structure */
+ xfs_perag_t *pag; /* per allocation group data */
+
+ agf = XFS_BUF_TO_AGF(agbp);
+ /*
+ * Freelist is empty, give up.
+ */
+ if (!agf->agf_flcount) {
+ *bnop = NULLAGBLOCK;
+ return 0;
+ }
+ /*
+ * Read the array of free blocks.
+ */
+ mp = tp->t_mountp;
+ if ((error = xfs_alloc_read_agfl(mp, tp,
+ be32_to_cpu(agf->agf_seqno), &agflbp)))
+ return error;
+ agfl = XFS_BUF_TO_AGFL(agflbp);
+ /*
+ * Get the block number and update the data structures.
+ */
+ bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ be32_add_cpu(&agf->agf_flfirst, 1);
+ xfs_trans_brelse(tp, agflbp);
+ if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
+ agf->agf_flfirst = 0;
+
+ pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ be32_add_cpu(&agf->agf_flcount, -1);
+ xfs_trans_agflist_delta(tp, -1);
+ pag->pagf_flcount--;
+ xfs_perag_put(pag);
+
+ logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, 1);
+ pag->pagf_btreeblks++;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+ *bnop = bno;
+
+ return 0;
+}
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_buf_t *bp, /* buffer for a.g. freelist header */
+ int fields) /* mask of fields to be logged (XFS_AGF_...) */
+{
+ int first; /* first byte offset */
+ int last; /* last byte offset */
+ static const short offsets[] = {
+ offsetof(xfs_agf_t, agf_magicnum),
+ offsetof(xfs_agf_t, agf_versionnum),
+ offsetof(xfs_agf_t, agf_seqno),
+ offsetof(xfs_agf_t, agf_length),
+ offsetof(xfs_agf_t, agf_roots[0]),
+ offsetof(xfs_agf_t, agf_levels[0]),
+ offsetof(xfs_agf_t, agf_flfirst),
+ offsetof(xfs_agf_t, agf_fllast),
+ offsetof(xfs_agf_t, agf_flcount),
+ offsetof(xfs_agf_t, agf_freeblks),
+ offsetof(xfs_agf_t, agf_longest),
+ offsetof(xfs_agf_t, agf_btreeblks),
+ sizeof(xfs_agf_t)
+ };
+
+ trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
+ xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+ xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int /* error */
+xfs_alloc_pagf_init(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags) /* XFS_ALLOC_FLAGS_... */
+{
+ xfs_buf_t *bp;
+ int error;
+
+ if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
+ return error;
+ if (bp)
+ xfs_trans_brelse(tp, bp);
+ return 0;
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int /* error */
+xfs_alloc_put_freelist(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_buf_t *agbp, /* buffer for a.g. freelist header */
+ xfs_buf_t *agflbp,/* buffer for a.g. free block array */
+ xfs_agblock_t bno, /* block being freed */
+ int btreeblk) /* block came from a AGF btree */
+{
+ xfs_agf_t *agf; /* a.g. freespace structure */
+ xfs_agfl_t *agfl; /* a.g. free block array */
+ __be32 *blockp;/* pointer to array entry */
+ int error;
+ int logflags;
+ xfs_mount_t *mp; /* mount structure */
+ xfs_perag_t *pag; /* per allocation group data */
+
+ agf = XFS_BUF_TO_AGF(agbp);
+ mp = tp->t_mountp;
+
+ if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
+ be32_to_cpu(agf->agf_seqno), &agflbp)))
+ return error;
+ agfl = XFS_BUF_TO_AGFL(agflbp);
+ be32_add_cpu(&agf->agf_fllast, 1);
+ if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
+ agf->agf_fllast = 0;
+
+ pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ be32_add_cpu(&agf->agf_flcount, 1);
+ xfs_trans_agflist_delta(tp, 1);
+ pag->pagf_flcount++;
+
+ logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+ if (btreeblk) {
+ be32_add_cpu(&agf->agf_btreeblks, -1);
+ pag->pagf_btreeblks--;
+ logflags |= XFS_AGF_BTREEBLKS;
+ }
+ xfs_perag_put(pag);
+
+ xfs_alloc_log_agf(tp, agbp, logflags);
+
+ ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
+ blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
+ *blockp = cpu_to_be32(bno);
+ xfs_alloc_log_agf(tp, agbp, logflags);
+ xfs_trans_log_buf(tp, agflbp,
+ (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
+ (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
+ sizeof(xfs_agblock_t) - 1));
+ return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int /* error */
+xfs_read_agf(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags, /* XFS_BUF_ */
+ struct xfs_buf **bpp) /* buffer for the ag freelist header */
+{
+ struct xfs_agf *agf; /* ag freelist header */
+ int agf_ok; /* set if agf is consistent */
+ int error;
+
+ ASSERT(agno != NULLAGNUMBER);
+ error = xfs_trans_read_buf(
+ mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), flags, bpp);
+ if (error)
+ return error;
+ if (!*bpp)
+ return 0;
+
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+ agf = XFS_BUF_TO_AGF(*bpp);
+
+ /*
+ * Validate the magic number of the agf block.
+ */
+ agf_ok =
+ be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_seqno) == agno;
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+ be32_to_cpu(agf->agf_length);
+ if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))) {
+ XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
+ XFS_ERRLEVEL_LOW, mp, agf);
+ xfs_trans_brelse(tp, *bpp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
+ return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int /* error */
+xfs_alloc_read_agf(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags, /* XFS_ALLOC_FLAG_... */
+ struct xfs_buf **bpp) /* buffer for the ag freelist header */
+{
+ struct xfs_agf *agf; /* ag freelist header */
+ struct xfs_perag *pag; /* per allocation group data */
+ int error;
+
+ ASSERT(agno != NULLAGNUMBER);
+
+ error = xfs_read_agf(mp, tp, agno,
+ (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+ bpp);
+ if (error)
+ return error;
+ if (!*bpp)
+ return 0;
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+ agf = XFS_BUF_TO_AGF(*bpp);
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagf_init) {
+ pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+ pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+ pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
+ pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+ pag->pagf_levels[XFS_BTNUM_BNOi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+ pag->pagf_levels[XFS_BTNUM_CNTi] =
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+ spin_lock_init(&pag->pagb_lock);
+ pag->pagb_count = 0;
+ pag->pagb_tree = RB_ROOT;
+ pag->pagf_init = 1;
+ }
+#ifdef DEBUG
+ else if (!XFS_FORCED_SHUTDOWN(mp)) {
+ ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+ ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
+ ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
+ ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
+ ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
+ ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+ }
+#endif
+ xfs_perag_put(pag);
+ return 0;
+}
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int /* error */
+xfs_alloc_vextent(
+ xfs_alloc_arg_t *args) /* allocation argument structure */
+{
+ xfs_agblock_t agsize; /* allocation group size */
+ int error;
+ int flags; /* XFS_ALLOC_FLAG_... locking flags */
+ xfs_extlen_t minleft;/* minimum left value, temp copy */
+ xfs_mount_t *mp; /* mount structure pointer */
+ xfs_agnumber_t sagno; /* starting allocation group number */
+ xfs_alloctype_t type; /* input allocation type */
+ int bump_rotor = 0;
+ int no_min = 0;
+ xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+
+ mp = args->mp;
+ type = args->otype = args->type;
+ args->agbno = NULLAGBLOCK;
+ /*
+ * Just fix this up, for the case where the last a.g. is shorter
+ * (or there's only one a.g.) and the caller couldn't easily figure
+ * that out (xfs_bmap_alloc).
+ */
+ agsize = mp->m_sb.sb_agblocks;
+ if (args->maxlen > agsize)
+ args->maxlen = agsize;
+ if (args->alignment == 0)
+ args->alignment = 1;
+ ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+ ASSERT(args->minlen <= args->maxlen);
+ ASSERT(args->minlen <= agsize);
+ ASSERT(args->mod < args->prod);
+ if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+ XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+ args->minlen > args->maxlen || args->minlen > agsize ||
+ args->mod >= args->prod) {
+ args->fsbno = NULLFSBLOCK;
+ trace_xfs_alloc_vextent_badargs(args);
+ return 0;
+ }
+ minleft = args->minleft;
+
+ switch (type) {
+ case XFS_ALLOCTYPE_THIS_AG:
+ case XFS_ALLOCTYPE_NEAR_BNO:
+ case XFS_ALLOCTYPE_THIS_BNO:
+ /*
+ * These three force us into a single a.g.
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->pag = xfs_perag_get(mp, args->agno);
+ args->minleft = 0;
+ error = xfs_alloc_fix_freelist(args, 0);
+ args->minleft = minleft;
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ goto error0;
+ }
+ if (!args->agbp) {
+ trace_xfs_alloc_vextent_noagbp(args);
+ break;
+ }
+ args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ if ((error = xfs_alloc_ag_vextent(args)))
+ goto error0;
+ break;
+ case XFS_ALLOCTYPE_START_BNO:
+ /*
+ * Try near allocation first, then anywhere-in-ag after
+ * the first a.g. fails.
+ */
+ if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+ (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+ args->fsbno = XFS_AGB_TO_FSB(mp,
+ ((mp->m_agfrotor / rotorstep) %
+ mp->m_sb.sb_agcount), 0);
+ bump_rotor = 1;
+ }
+ args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ /* FALLTHROUGH */
+ case XFS_ALLOCTYPE_ANY_AG:
+ case XFS_ALLOCTYPE_START_AG:
+ case XFS_ALLOCTYPE_FIRST_AG:
+ /*
+ * Rotate through the allocation groups looking for a winner.
+ */
+ if (type == XFS_ALLOCTYPE_ANY_AG) {
+ /*
+ * Start with the last place we left off.
+ */
+ args->agno = sagno = (mp->m_agfrotor / rotorstep) %
+ mp->m_sb.sb_agcount;
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
+ } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+ /*
+ * Start with allocation group given by bno.
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ sagno = 0;
+ flags = 0;
+ } else {
+ if (type == XFS_ALLOCTYPE_START_AG)
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ /*
+ * Start with the given allocation group.
+ */
+ args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
+ }
+ /*
+ * Loop over allocation groups twice; first time with
+ * trylock set, second time without.
+ */
+ for (;;) {
+ args->pag = xfs_perag_get(mp, args->agno);
+ if (no_min) args->minleft = 0;
+ error = xfs_alloc_fix_freelist(args, flags);
+ args->minleft = minleft;
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+ goto error0;
+ }
+ /*
+ * If we get a buffer back then the allocation will fly.
+ */
+ if (args->agbp) {
+ if ((error = xfs_alloc_ag_vextent(args)))
+ goto error0;
+ break;
+ }
+
+ trace_xfs_alloc_vextent_loopfailed(args);
+
+ /*
+ * Didn't work, figure out the next iteration.
+ */
+ if (args->agno == sagno &&
+ type == XFS_ALLOCTYPE_START_BNO)
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+ /*
+ * For the first allocation, we can try any AG to get
+ * space. However, if we already have allocated a
+ * block, we don't want to try AGs whose number is below
+ * sagno. Otherwise, we may end up with out-of-order
+ * locking of AGF, which might cause deadlock.
+ */
+ if (++(args->agno) == mp->m_sb.sb_agcount) {
+ if (args->firstblock != NULLFSBLOCK)
+ args->agno = sagno;
+ else
+ args->agno = 0;
+ }
+ /*
+ * Reached the starting a.g., must either be done
+ * or switch to non-trylock mode.
+ */
+ if (args->agno == sagno) {
+ if (no_min == 1) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_vextent_allfailed(args);
+ break;
+ }
+ if (flags == 0) {
+ no_min = 1;
+ } else {
+ flags = 0;
+ if (type == XFS_ALLOCTYPE_START_BNO) {
+ args->agbno = XFS_FSB_TO_AGBNO(mp,
+ args->fsbno);
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ }
+ }
+ xfs_perag_put(args->pag);
+ }
+ if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+ if (args->agno == sagno)
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ else
+ mp->m_agfrotor = (args->agno * rotorstep + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ }
+ break;
+ default:
+ ASSERT(0);
+ /* NOTREACHED */
+ }
+ if (args->agbno == NULLAGBLOCK)
+ args->fsbno = NULLFSBLOCK;
+ else {
+ args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+ ASSERT(args->len >= args->minlen);
+ ASSERT(args->len <= args->maxlen);
+ ASSERT(args->agbno % args->alignment == 0);
+ XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+ args->len);
+#endif
+ }
+ xfs_perag_put(args->pag);
+ return 0;
+error0:
+ xfs_perag_put(args->pag);
+ return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int /* error */
+xfs_free_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ xfs_alloc_arg_t args;
+ int error;
+
+ ASSERT(len != 0);
+ memset(&args, 0, sizeof(xfs_alloc_arg_t));
+ args.tp = tp;
+ args.mp = tp->t_mountp;
+
+ /*
+ * validate that the block number is legal - the enables us to detect
+ * and handle a silent filesystem corruption rather than crashing.
+ */
+ args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+ if (args.agno >= args.mp->m_sb.sb_agcount)
+ return EFSCORRUPTED;
+
+ args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+ if (args.agbno >= args.mp->m_sb.sb_agblocks)
+ return EFSCORRUPTED;
+
+ args.pag = xfs_perag_get(args.mp, args.agno);
+ ASSERT(args.pag);
+
+ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+ if (error)
+ goto error0;
+
+ /* validate the extent size is legal now we have the agf locked */
+ if (args.agbno + len >
+ be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+ error = EFSCORRUPTED;
+ goto error0;
+ }
+
+ error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+ if (!error)
+ xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
+error0:
+ xfs_perag_put(args.pag);
+ return error;
+}
+
+void
+xfs_alloc_busy_insert(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ unsigned int flags)
+{
+ struct xfs_busy_extent *new;
+ struct xfs_busy_extent *busyp;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent = NULL;
+
+ new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
+ if (!new) {
+ /*
+ * No Memory! Since it is now not possible to track the free
+ * block, make this a synchronous transaction to insure that
+ * the block is not reused before this transaction commits.
+ */
+ trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
+ xfs_trans_set_sync(tp);
+ return;
+ }
+
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
+ INIT_LIST_HEAD(&new->list);
+ new->flags = flags;
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
+
+ pag = xfs_perag_get(tp->t_mountp, new->agno);
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ while (*rbp) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
+
+ if (new->bno < busyp->bno) {
+ rbp = &(*rbp)->rb_left;
+ ASSERT(new->bno + new->length <= busyp->bno);
+ } else if (new->bno > busyp->bno) {
+ rbp = &(*rbp)->rb_right;
+ ASSERT(bno >= busyp->bno + busyp->length);
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+ list_add(&new->list, &tp->t_busy);
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. You need to be holding the busy extent tree lock when calling
+ * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+int
+xfs_alloc_busy_search(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+ struct xfs_busy_extent *busyp;
+ int match = 0;
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+
+ rbp = pag->pagb_tree.rb_node;
+
+ /* find closest start bno overlap */
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ return match;
+}
+
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent. If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation. We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entries block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_alloc_busy_update_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_busy_extent *busyp,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ /*
+ * This extent is currently being discarded. Give the thread
+ * performing the discard a chance to mark the extent unbusy
+ * and retry.
+ */
+ if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
+ spin_unlock(&pag->pagb_lock);
+ delay(1);
+ spin_lock(&pag->pagb_lock);
+ return false;
+ }
+
+ /*
+ * If there is a busy extent overlapping a user allocation, we have
+ * no choice but to force the log and retry the search.
+ *
+ * Fortunately this does not happen during normal operation, but
+ * only if the filesystem is very low on space and has to dip into
+ * the AGFL for normal allocations.
+ */
+ if (userdata)
+ goto out_force_log;
+
+ if (bbno < fbno && bend > fend) {
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ */
+
+ /*
+ * We would have to split the busy extent to be able to track
+ * it correct, which we cannot do because we would have to
+ * modify the list of busy extents attached to the transaction
+ * or CIL context, which is immutable.
+ *
+ * Force out the log to clear the busy extent and retry the
+ * search.
+ */
+ goto out_force_log;
+ } else if (bbno >= fbno && bend <= fend) {
+ /*
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ */
+
+ /*
+ * The busy extent is fully covered by the extent we are
+ * allocating, and can simply be removed from the rbtree.
+ * However we cannot remove it from the immutable list
+ * tracking busy extents in the transaction or CIL context,
+ * so set the length to zero to mark it invalid.
+ *
+ * We also need to restart the busy extent search from the
+ * tree root, because erasing the node can rearrange the
+ * tree topology.
+ */
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ busyp->length = 0;
+ return false;
+ } else if (fend < bend) {
+ /*
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ */
+ busyp->bno = fend;
+ } else if (bbno < fbno) {
+ /*
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ */
+ busyp->length = fbno - busyp->bno;
+ } else {
+ ASSERT(0);
+ }
+
+ trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
+ return true;
+
+out_force_log:
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
+ spin_lock(&pag->pagb_lock);
+ return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
+void
+xfs_alloc_busy_reuse(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+
+ ASSERT(flen > 0);
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+restart:
+ rbp = pag->pagb_tree.rb_node;
+ while (rbp) {
+ struct xfs_busy_extent *busyp =
+ rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fbno + flen <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
+ userdata))
+ goto restart;
+ }
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy. If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+STATIC void
+xfs_alloc_busy_trim(
+ struct xfs_alloc_arg *args,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ xfs_agblock_t *rbno,
+ xfs_extlen_t *rlen)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ struct rb_node *rbp;
+
+ ASSERT(len > 0);
+
+ spin_lock(&args->pag->pagb_lock);
+restart:
+ fbno = bno;
+ flen = len;
+ rbp = args->pag->pagb_tree.rb_node;
+ while (rbp && flen >= args->minlen) {
+ struct xfs_busy_extent *busyp =
+ rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fend <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ /*
+ * If this is a metadata allocation, try to reuse the busy
+ * extent instead of trimming the allocation.
+ */
+ if (!args->userdata &&
+ !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
+ if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
+ busyp, fbno, flen,
+ false))
+ goto restart;
+ continue;
+ }
+
+ if (bbno <= fbno) {
+ /* start overlap */
+
+ /*
+ * Case 1:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +---------+
+ * fbno fend
+ *
+ * Case 2:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 3:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-------------+
+ * fbno fend
+ *
+ * Case 4:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------+
+ * fbno fend
+ *
+ * No unbusy region in extent, return failure.
+ */
+ if (fend <= bend)
+ goto fail;
+
+ /*
+ * Case 5:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +----------------------+
+ * fbno fend
+ *
+ * Case 6:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fbno = bend;
+ } else if (bend >= fend) {
+ /* end overlap */
+
+ /*
+ * Case 7:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +------------------+
+ * fbno fend
+ *
+ * Case 8:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +--------------------------+
+ * fbno fend
+ *
+ * Needs to be trimmed to:
+ * +-------+
+ * fbno fend
+ */
+ fend = bbno;
+ } else {
+ /* middle overlap */
+
+ /*
+ * Case 9:
+ * bbno bend
+ * +BBBBBBBBBBBBBBBBB+
+ * +-----------------------------------+
+ * fbno fend
+ *
+ * Can be trimmed to:
+ * +-------+ OR +-------+
+ * fbno fend fbno fend
+ *
+ * Backward allocation leads to significant
+ * fragmentation of directories, which degrades
+ * directory performance, therefore we always want to
+ * choose the option that produces forward allocation
+ * patterns.
+ * Preferring the lower bno extent will make the next
+ * request use "fend" as the start of the next
+ * allocation; if the segment is no longer busy at
+ * that point, we'll get a contiguous allocation, but
+ * even if it is still busy, we will get a forward
+ * allocation.
+ * We try to avoid choosing the segment at "bend",
+ * because that can lead to the next allocation
+ * taking the segment at "fbno", which would be a
+ * backward allocation. We only use the segment at
+ * "fbno" if it is much larger than the current
+ * requested size, because in that case there's a
+ * good chance subsequent allocations will be
+ * contiguous.
+ */
+ if (bbno - fbno >= args->maxlen) {
+ /* left candidate fits perfect */
+ fend = bbno;
+ } else if (fend - bend >= args->maxlen * 4) {
+ /* right candidate has enough free space */
+ fbno = bend;
+ } else if (bbno - fbno >= args->minlen) {
+ /* left candidate fits minimum requirement */
+ fend = bbno;
+ } else {
+ goto fail;
+ }
+ }
+
+ flen = fend - fbno;
+ }
+ spin_unlock(&args->pag->pagb_lock);
+
+ if (fbno != bno || flen != len) {
+ trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
+ fbno, flen);
+ }
+ *rbno = fbno;
+ *rlen = flen;
+ return;
+fail:
+ /*
+ * Return a zero extent length as failure indications. All callers
+ * re-check if the trimmed extent satisfies the minlen requirement.
+ */
+ spin_unlock(&args->pag->pagb_lock);
+ trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+ *rbno = fbno;
+ *rlen = 0;
+}
+
+static void
+xfs_alloc_busy_clear_one(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_busy_extent *busyp)
+{
+ if (busyp->length) {
+ trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
+ busyp->length);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ }
+
+ list_del_init(&busyp->list);
+ kmem_free(busyp);
+}
+
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
+void
+xfs_alloc_busy_clear(
+ struct xfs_mount *mp,
+ struct list_head *list,
+ bool do_discard)
+{
+ struct xfs_busy_extent *busyp, *n;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = NULLAGNUMBER;
+
+ list_for_each_entry_safe(busyp, n, list, list) {
+ if (busyp->agno != agno) {
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ agno = busyp->agno;
+ }
+
+ if (do_discard && busyp->length &&
+ !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
+ busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
+ else
+ xfs_alloc_busy_clear_one(mp, pag, busyp);
+ }
+
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ */
+int
+xfs_busy_extent_ag_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ return container_of(a, struct xfs_busy_extent, list)->agno -
+ container_of(b, struct xfs_busy_extent, list)->agno;
+}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
new file mode 100644
index 00000000..2f52b924
--- /dev/null
+++ b/fs/xfs/xfs_alloc.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ALLOC_H__
+#define __XFS_ALLOC_H__
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+struct xfs_busy_extent;
+
+/*
+ * Freespace allocation types. Argument to xfs_alloc_[v]extent.
+ */
+#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+
+#define XFS_ALLOC_TYPES \
+ { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \
+ { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \
+ { XFS_ALLOCTYPE_START_AG, "START_AG" }, \
+ { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \
+ { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \
+ { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \
+ { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" }
+
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ */
+#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
+#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks
+ * to 4 + 4*agcount.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ * - the AG superblock, AGF, AGI and AGFL
+ * - the AGF (bno and cnt) and AGI btree root blocks
+ * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp) \
+ ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+
+
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ */
+typedef struct xfs_alloc_arg {
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_mount *mp; /* file system mount point */
+ struct xfs_buf *agbp; /* buffer for a.g. freelist header */
+ struct xfs_perag *pag; /* per-ag struct for this agno */
+ xfs_fsblock_t fsbno; /* file system block number */
+ xfs_agnumber_t agno; /* allocation group number */
+ xfs_agblock_t agbno; /* allocation group-relative block # */
+ xfs_extlen_t minlen; /* minimum size of extent */
+ xfs_extlen_t maxlen; /* maximum size of extent */
+ xfs_extlen_t mod; /* mod value for extent size */
+ xfs_extlen_t prod; /* prod value for extent size */
+ xfs_extlen_t minleft; /* min blocks must be left after us */
+ xfs_extlen_t total; /* total blocks needed in xaction */
+ xfs_extlen_t alignment; /* align answer to multiple of this */
+ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
+ xfs_extlen_t len; /* output: actual size of extent */
+ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
+ xfs_alloctype_t otype; /* original allocation type */
+ char wasdel; /* set if allocation was prev delayed */
+ char wasfromfl; /* set if allocation is from freelist */
+ char isfl; /* set if is freelist blocks - !acctg */
+ char userdata; /* set if this is user data */
+ xfs_fsblock_t firstblock; /* io first block allocated */
+} xfs_alloc_arg_t;
+
+/*
+ * Defines for userdata
+ */
+#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
+
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+ struct xfs_perag *pag);
+
+#ifdef __KERNEL__
+void
+xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
+ bool do_discard);
+
+int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+int
+xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+static inline void xfs_alloc_busy_sort(struct list_head *list)
+{
+ list_sort(NULL, list, xfs_busy_extent_ag_cmp);
+}
+
+#endif /* __KERNEL__ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+ struct xfs_mount *mp); /* file system mount structure */
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int /* error */
+xfs_alloc_get_freelist(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer containing the agf structure */
+ xfs_agblock_t *bnop, /* block address retrieved from freelist */
+ int btreeblk); /* destination is a AGF btree */
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *bp, /* buffer for a.g. freelist header */
+ int fields);/* mask of fields to be logged (XFS_AGF_...) */
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int /* error */
+xfs_alloc_pagf_init(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags); /* XFS_ALLOC_FLAGS_... */
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int /* error */
+xfs_alloc_put_freelist(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for a.g. freelist header */
+ struct xfs_buf *agflbp,/* buffer for a.g. free block array */
+ xfs_agblock_t bno, /* block being freed */
+ int btreeblk); /* owner was a AGF btree */
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int /* error */
+xfs_alloc_read_agf(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags, /* XFS_ALLOC_FLAG_... */
+ struct xfs_buf **bpp); /* buffer for the ag freelist header */
+
+/*
+ * Allocate an extent (variable-size).
+ */
+int /* error */
+xfs_alloc_vextent(
+ xfs_alloc_arg_t *args); /* allocation argument structure */
+
+/*
+ * Free an extent.
+ */
+int /* error */
+xfs_free_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len); /* length of extent */
+
+int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat); /* success/failure */
+
+int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat); /* output: success/failure */
+
+#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
new file mode 100644
index 00000000..2b351882
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_btnum);
+}
+
+STATIC void
+xfs_allocbt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ int btnum = cur->bc_btnum;
+ struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ pag->pagf_levels[btnum] += inc;
+ xfs_perag_put(pag);
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int length,
+ int *stat)
+{
+ int error;
+ xfs_agblock_t bno;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ &bno, 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+
+ if (bno == NULLAGBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
+ xfs_trans_agbtree_delta(cur->bc_tp, 1);
+ new->s = cpu_to_be32(bno);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_allocbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agblock_t bno;
+ int error;
+
+ bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+ error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+ if (error)
+ return error;
+
+ xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_ALLOC_BUSY_SKIP_DISCARD);
+ xfs_trans_agbtree_delta(cur->bc_tp, -1);
+ return 0;
+}
+
+/*
+ * Update the longest extent in the AGF
+ */
+STATIC void
+xfs_allocbt_update_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_rec *rec,
+ int ptr,
+ int reason)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_perag *pag;
+ __be32 len;
+ int numrecs;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+ switch (reason) {
+ case LASTREC_UPDATE:
+ /*
+ * If this is the last leaf block and it's the last record,
+ * then update the size of the longest extent in the AG.
+ */
+ if (ptr != xfs_btree_get_numrecs(block))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_INSREC:
+ if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+ be32_to_cpu(agf->agf_longest))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_DELREC:
+ numrecs = xfs_btree_get_numrecs(block);
+ if (ptr <= numrecs)
+ return;
+ ASSERT(ptr == numrecs + 1);
+
+ if (numrecs) {
+ xfs_alloc_rec_t *rrp;
+
+ rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+ len = rrp->ar_blockcount;
+ } else {
+ len = 0;
+ }
+
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ agf->agf_longest = len;
+ pag = xfs_perag_get(cur->bc_mp, seqno);
+ pag->pagf_longest = be32_to_cpu(len);
+ xfs_perag_put(pag);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+}
+
+STATIC int
+xfs_allocbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_alloc_mnr[level != 0];
+}
+
+STATIC int
+xfs_allocbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_alloc_mxr[level != 0];
+}
+
+STATIC void
+xfs_allocbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ ASSERT(rec->alloc.ar_startblock != 0);
+
+ key->alloc.ar_startblock = rec->alloc.ar_startblock;
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ ASSERT(key->alloc.ar_startblock != 0);
+
+ rec->alloc.ar_startblock = key->alloc.ar_startblock;
+ rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ ASSERT(cur->bc_rec.a.ar_startblock != 0);
+
+ rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+ rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+ ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_allocbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
+ xfs_alloc_key_t *kp = &key->alloc;
+ __int64_t diff;
+
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+ rec->ar_startblock;
+ }
+
+ diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+ if (diff)
+ return diff;
+
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+#ifdef DEBUG
+STATIC int
+xfs_allocbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
+ } else {
+ return be32_to_cpu(k1->alloc.ar_blockcount) <
+ be32_to_cpu(k2->alloc.ar_blockcount) ||
+ (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+ be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock));
+ }
+}
+
+STATIC int
+xfs_allocbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+ } else {
+ return be32_to_cpu(r1->alloc.ar_blockcount) <
+ be32_to_cpu(r2->alloc.ar_blockcount) ||
+ (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+ be32_to_cpu(r1->alloc.ar_startblock) <
+ be32_to_cpu(r2->alloc.ar_startblock));
+ }
+}
+#endif /* DEBUG */
+
+#ifdef XFS_BTREE_TRACE
+ktrace_t *xfs_allocbt_trace_buf;
+
+STATIC void
+xfs_allocbt_trace_enter(
+ struct xfs_btree_cur *cur,
+ const char *func,
+ char *s,
+ int type,
+ int line,
+ __psunsigned_t a0,
+ __psunsigned_t a1,
+ __psunsigned_t a2,
+ __psunsigned_t a3,
+ __psunsigned_t a4,
+ __psunsigned_t a5,
+ __psunsigned_t a6,
+ __psunsigned_t a7,
+ __psunsigned_t a8,
+ __psunsigned_t a9,
+ __psunsigned_t a10)
+{
+ ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
+ (void *)func, (void *)s, NULL, (void *)cur,
+ (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+ (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+ (void *)a8, (void *)a9, (void *)a10);
+}
+
+STATIC void
+xfs_allocbt_trace_cursor(
+ struct xfs_btree_cur *cur,
+ __uint32_t *s0,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ *s0 = cur->bc_private.a.agno;
+ *l0 = cur->bc_rec.a.ar_startblock;
+ *l1 = cur->bc_rec.a.ar_blockcount;
+}
+
+STATIC void
+xfs_allocbt_trace_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ *l0 = be32_to_cpu(key->alloc.ar_startblock);
+ *l1 = be32_to_cpu(key->alloc.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_trace_record(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ __uint64_t *l0,
+ __uint64_t *l1,
+ __uint64_t *l2)
+{
+ *l0 = be32_to_cpu(rec->alloc.ar_startblock);
+ *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
+ *l2 = 0;
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_rec_from_key = xfs_allocbt_init_rec_from_key,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_allocbt_key_diff,
+
+#ifdef DEBUG
+ .keys_inorder = xfs_allocbt_keys_inorder,
+ .recs_inorder = xfs_allocbt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+ .trace_enter = xfs_allocbt_trace_enter,
+ .trace_cursor = xfs_allocbt_trace_cursor,
+ .trace_key = xfs_allocbt_trace_key,
+ .trace_record = xfs_allocbt_trace_record,
+#endif
+};
+
+/*
+ * Allocate a new allocation btree cursor.
+ */
+struct xfs_btree_cur * /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agf structure */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* btree identifier */
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_btree_cur *cur;
+
+ ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
+ cur->bc_btnum = btnum;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+ cur->bc_ops = &xfs_allocbt_ops;
+ if (btnum == XFS_BTNUM_CNT)
+ cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+
+ return cur;
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
+}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
new file mode 100644
index 00000000..a6caa002
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define __XFS_ALLOC_BTREE_H__
+
+/*
+ * Freespace on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno. All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
+#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+ __be32 ar_startblock; /* starting block number */
+ __be32 ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+ xfs_agblock_t ar_startblock; /* starting block number */
+ xfs_extlen_t ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Minimum and maximum blocksize and sectorsize.
+ * The blocksize upper limit is pretty much arbitrary.
+ * The sectorsize upper limit is due to sizeof(sb_sectsize).
+ */
+#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
+#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
+#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
+#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
+#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
+#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
+#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
+#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+/*
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
+ */
+#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+ ((xfs_alloc_rec_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+ ((xfs_alloc_key_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_alloc_ptr_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_alloc_key_t) + \
+ ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_buf *,
+ xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+
+#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
new file mode 100644
index 00000000..09022493
--- /dev/null
+++ b/fs/xfs/xfs_arch.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ARCH_H__
+#define __XFS_ARCH_H__
+
+#ifndef XFS_BIG_INUMS
+# error XFS_BIG_INUMS must be defined true or false
+#endif
+
+#ifdef __KERNEL__
+
+#include <asm/byteorder.h>
+
+#ifdef __BIG_ENDIAN
+#define XFS_NATIVE_HOST 1
+#else
+#undef XFS_NATIVE_HOST
+#endif
+
+#else /* __KERNEL__ */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define XFS_NATIVE_HOST 1
+#else
+#undef XFS_NATIVE_HOST
+#endif
+
+#ifdef XFS_NATIVE_HOST
+#define cpu_to_be16(val) ((__force __be16)(__u16)(val))
+#define cpu_to_be32(val) ((__force __be32)(__u32)(val))
+#define cpu_to_be64(val) ((__force __be64)(__u64)(val))
+#define be16_to_cpu(val) ((__force __u16)(__be16)(val))
+#define be32_to_cpu(val) ((__force __u32)(__be32)(val))
+#define be64_to_cpu(val) ((__force __u64)(__be64)(val))
+#else
+#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val)))
+#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val)))
+#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val)))
+#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val)))
+#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val)))
+#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val)))
+#endif
+
+static inline void be16_add_cpu(__be16 *a, __s16 b)
+{
+ *a = cpu_to_be16(be16_to_cpu(*a) + b);
+}
+
+static inline void be32_add_cpu(__be32 *a, __s32 b)
+{
+ *a = cpu_to_be32(be32_to_cpu(*a) + b);
+}
+
+static inline void be64_add_cpu(__be64 *a, __s64 b)
+{
+ *a = cpu_to_be64(be64_to_cpu(*a) + b);
+}
+
+#endif /* __KERNEL__ */
+
+/*
+ * get and set integers from potentially unaligned locations
+ */
+
+#define INT_GET_UNALIGNED_16_BE(pointer) \
+ ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1])))
+#define INT_SET_UNALIGNED_16_BE(pointer,value) \
+ { \
+ ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \
+ ((__u8*)(pointer))[1] = (((value) ) & 0xff); \
+ }
+
+/*
+ * In directories inode numbers are stored as unaligned arrays of unsigned
+ * 8bit integers on disk.
+ *
+ * For v1 directories or v2 directories that contain inode numbers that
+ * do not fit into 32bit the array has eight members, but the first member
+ * is always zero:
+ *
+ * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7|
+ *
+ * For v2 directories that only contain entries with inode numbers that fit
+ * into 32bits a four-member array is used:
+ *
+ * |24-31|16-23| 8-15| 0- 7|
+ */
+
+#define XFS_GET_DIR_INO4(di) \
+ (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
+
+#define XFS_PUT_DIR_INO4(from, di) \
+do { \
+ (di).i[0] = (((from) & 0xff000000ULL) >> 24); \
+ (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \
+ (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \
+ (di).i[3] = ((from) & 0x000000ffULL); \
+} while (0)
+
+#define XFS_DI_HI(di) \
+ (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
+#define XFS_DI_LO(di) \
+ (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7]))
+
+#define XFS_GET_DIR_INO8(di) \
+ (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \
+ ((xfs_ino_t)XFS_DI_HI(di) << 32))
+
+#define XFS_PUT_DIR_INO8(from, di) \
+do { \
+ (di).i[0] = 0; \
+ (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \
+ (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \
+ (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \
+ (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \
+ (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \
+ (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \
+ (di).i[7] = ((from) & 0x00000000000000ffULL); \
+} while (0)
+
+#endif /* __XFS_ARCH_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
new file mode 100644
index 00000000..99d40116
--- /dev/null
+++ b/fs/xfs/xfs_attr.c
@@ -0,0 +1,2230 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_rw.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_get(xfs_da_args_t *args);
+STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
+STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+
+/*
+ * Routines to manipulate out-of-line attribute values.
+ */
+STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
+STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
+
+#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+
+STATIC int
+xfs_attr_name_to_xname(
+ struct xfs_name *xname,
+ const unsigned char *aname)
+{
+ if (!aname)
+ return EINVAL;
+ xname->name = aname;
+ xname->len = strlen((char *)aname);
+ if (xname->len >= MAXNAMELEN)
+ return EFAULT; /* match IRIX behaviour */
+
+ return 0;
+}
+
+STATIC int
+xfs_inode_hasattr(
+ struct xfs_inode *ip)
+{
+ if (!XFS_IFORK_Q(ip) ||
+ (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_anextents == 0))
+ return 0;
+ return 1;
+}
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+STATIC int
+xfs_attr_get_int(
+ struct xfs_inode *ip,
+ struct xfs_name *name,
+ unsigned char *value,
+ int *valuelenp,
+ int flags)
+{
+ xfs_da_args_t args;
+ int error;
+
+ if (!xfs_inode_hasattr(ip))
+ return ENOATTR;
+
+ /*
+ * Fill in the arg structure for this request.
+ */
+ memset((char *)&args, 0, sizeof(args));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.value = value;
+ args.valuelen = *valuelenp;
+ args.flags = flags;
+ args.hashval = xfs_da_hashname(args.name, args.namelen);
+ args.dp = ip;
+ args.whichfork = XFS_ATTR_FORK;
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ error = xfs_attr_shortform_getvalue(&args);
+ } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_get(&args);
+ } else {
+ error = xfs_attr_node_get(&args);
+ }
+
+ /*
+ * Return the number of bytes in the value to the caller.
+ */
+ *valuelenp = args.valuelen;
+
+ if (error == EEXIST)
+ error = 0;
+ return(error);
+}
+
+int
+xfs_attr_get(
+ xfs_inode_t *ip,
+ const unsigned char *name,
+ unsigned char *value,
+ int *valuelenp,
+ int flags)
+{
+ int error;
+ struct xfs_name xname;
+
+ XFS_STATS_INC(xs_attr_get);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return(EIO);
+
+ error = xfs_attr_name_to_xname(&xname, name);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return(error);
+}
+
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+STATIC int
+xfs_attr_calc_size(
+ struct xfs_inode *ip,
+ int namelen,
+ int valuelen,
+ int *local)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int size;
+ int nblks;
+
+ /*
+ * Determine space new attribute will use, and if it would be
+ * "local" or "remote" (note: local != inline).
+ */
+ size = xfs_attr_leaf_newentsize(namelen, valuelen,
+ mp->m_sb.sb_blocksize, local);
+
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ if (*local) {
+ if (size > (mp->m_sb.sb_blocksize >> 1)) {
+ /* Double split possible */
+ nblks *= 2;
+ }
+ } else {
+ /*
+ * Out of line attribute, cannot double split, but
+ * make room for the attribute value itself.
+ */
+ uint dblocks = XFS_B_TO_FSB(mp, valuelen);
+ nblks += dblocks;
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+ }
+
+ return nblks;
+}
+
+STATIC int
+xfs_attr_set_int(
+ struct xfs_inode *dp,
+ struct xfs_name *name,
+ unsigned char *value,
+ int valuelen,
+ int flags)
+{
+ xfs_da_args_t args;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+ int error, err2, committed;
+ xfs_mount_t *mp = dp->i_mount;
+ int rsvd = (flags & ATTR_ROOT) != 0;
+ int local;
+
+ /*
+ * Attach the dquots to the inode.
+ */
+ error = xfs_qm_dqattach(dp, 0);
+ if (error)
+ return error;
+
+ /*
+ * If the inode doesn't have an attribute fork, add one.
+ * (inode must not be locked when we call this routine)
+ */
+ if (XFS_IFORK_Q(dp) == 0) {
+ int sf_size = sizeof(xfs_attr_sf_hdr_t) +
+ XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen);
+
+ if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd)))
+ return(error);
+ }
+
+ /*
+ * Fill in the arg structure for this request.
+ */
+ memset((char *)&args, 0, sizeof(args));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.value = value;
+ args.valuelen = valuelen;
+ args.flags = flags;
+ args.hashval = xfs_da_hashname(args.name, args.namelen);
+ args.dp = dp;
+ args.firstblock = &firstblock;
+ args.flist = &flist;
+ args.whichfork = XFS_ATTR_FORK;
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+ /* Size is now blocks for attribute data */
+ args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
+
+ /*
+ * Start our first transaction of the day.
+ *
+ * All future transactions during this code must be "chained" off
+ * this one via the trans_dup() call. All transactions will contain
+ * the inode, and the inode will always be marked with trans_ihold().
+ * Since the inode will be locked in all transactions, we must log
+ * the inode in every transaction to let it float upward through
+ * the log.
+ */
+ args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+
+ /*
+ * Root fork attributes can use reserved data blocks for this
+ * operation if necessary
+ */
+
+ if (rsvd)
+ args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+ if ((error = xfs_trans_reserve(args.trans, args.total,
+ XFS_ATTRSET_LOG_RES(mp, args.total), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
+ xfs_trans_cancel(args.trans, 0);
+ return(error);
+ }
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+ error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
+ rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+ XFS_QMOPT_RES_REGBLKS);
+ if (error) {
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ return (error);
+ }
+
+ xfs_trans_ijoin(args.trans, dp);
+
+ /*
+ * If the attribute list is non-existent or a shortform list,
+ * upgrade it to a single-leaf-block attribute list.
+ */
+ if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
+ ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
+ (dp->i_d.di_anextents == 0))) {
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(&args);
+
+ /*
+ * Try to add the attr to the attribute list in
+ * the inode.
+ */
+ error = xfs_attr_shortform_addname(&args);
+ if (error != ENOSPC) {
+ /*
+ * Commit the shortform mods, and we're done.
+ * NOTE: this is also the error path (EEXIST, etc).
+ */
+ ASSERT(args.trans != NULL);
+
+ /*
+ * If this is a synchronous mount, make sure that
+ * the transaction goes to disk before returning
+ * to the user.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ xfs_trans_set_sync(args.trans);
+ }
+
+ if (!error && (flags & ATTR_KERNOTIME) == 0) {
+ xfs_trans_ichgtime(args.trans, dp,
+ XFS_ICHGTIME_CHG);
+ }
+ err2 = xfs_trans_commit(args.trans,
+ XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+ return(error == 0 ? err2 : error);
+ }
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block.
+ * GROT: another possible req'mt for a double-split btree op.
+ */
+ xfs_bmap_init(args.flist, args.firstblock);
+ error = xfs_attr_shortform_to_leaf(&args);
+ if (!error) {
+ error = xfs_bmap_finish(&args.trans, args.flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args.trans = NULL;
+ xfs_bmap_cancel(&flist);
+ goto out;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args.trans, dp);
+
+ /*
+ * Commit the leaf transformation. We'll need another (linked)
+ * transaction to add the new attribute to the leaf.
+ */
+
+ error = xfs_trans_roll(&args.trans, dp);
+ if (error)
+ goto out;
+
+ }
+
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_addname(&args);
+ } else {
+ error = xfs_attr_node_addname(&args);
+ }
+ if (error) {
+ goto out;
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ xfs_trans_set_sync(args.trans);
+ }
+
+ if ((flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+ /*
+ * Commit the last in the sequence of transactions.
+ */
+ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+ return(error);
+
+out:
+ if (args.trans)
+ xfs_trans_cancel(args.trans,
+ XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return(error);
+}
+
+int
+xfs_attr_set(
+ xfs_inode_t *dp,
+ const unsigned char *name,
+ unsigned char *value,
+ int valuelen,
+ int flags)
+{
+ int error;
+ struct xfs_name xname;
+
+ XFS_STATS_INC(xs_attr_set);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return (EIO);
+
+ error = xfs_attr_name_to_xname(&xname, name);
+ if (error)
+ return error;
+
+ return xfs_attr_set_int(dp, &xname, value, valuelen, flags);
+}
+
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
+STATIC int
+xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
+{
+ xfs_da_args_t args;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+ int error;
+ xfs_mount_t *mp = dp->i_mount;
+
+ /*
+ * Fill in the arg structure for this request.
+ */
+ memset((char *)&args, 0, sizeof(args));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.flags = flags;
+ args.hashval = xfs_da_hashname(args.name, args.namelen);
+ args.dp = dp;
+ args.firstblock = &firstblock;
+ args.flist = &flist;
+ args.total = 0;
+ args.whichfork = XFS_ATTR_FORK;
+
+ /*
+ * we have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail.
+ */
+ args.op_flags = XFS_DA_OP_OKNOENT;
+
+ /*
+ * Attach the dquots to the inode.
+ */
+ error = xfs_qm_dqattach(dp, 0);
+ if (error)
+ return error;
+
+ /*
+ * Start our first transaction of the day.
+ *
+ * All future transactions during this code must be "chained" off
+ * this one via the trans_dup() call. All transactions will contain
+ * the inode, and the inode will always be marked with trans_ihold().
+ * Since the inode will be locked in all transactions, we must log
+ * the inode in every transaction to let it float upward through
+ * the log.
+ */
+ args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
+
+ /*
+ * Root fork attributes can use reserved data blocks for this
+ * operation if necessary
+ */
+
+ if (flags & ATTR_ROOT)
+ args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+ if ((error = xfs_trans_reserve(args.trans,
+ XFS_ATTRRM_SPACE_RES(mp),
+ XFS_ATTRRM_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_ATTRRM_LOG_COUNT))) {
+ xfs_trans_cancel(args.trans, 0);
+ return(error);
+ }
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
+ /*
+ * No need to make quota reservations here. We expect to release some
+ * blocks not allocate in the common case.
+ */
+ xfs_trans_ijoin(args.trans, dp);
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ if (!xfs_inode_hasattr(dp)) {
+ error = XFS_ERROR(ENOATTR);
+ goto out;
+ }
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+ error = xfs_attr_shortform_remove(&args);
+ if (error) {
+ goto out;
+ }
+ } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_removename(&args);
+ } else {
+ error = xfs_attr_node_removename(&args);
+ }
+ if (error) {
+ goto out;
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ xfs_trans_set_sync(args.trans);
+ }
+
+ if ((flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+
+ /*
+ * Commit the last in the sequence of transactions.
+ */
+ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+ return(error);
+
+out:
+ if (args.trans)
+ xfs_trans_cancel(args.trans,
+ XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return(error);
+}
+
+int
+xfs_attr_remove(
+ xfs_inode_t *dp,
+ const unsigned char *name,
+ int flags)
+{
+ int error;
+ struct xfs_name xname;
+
+ XFS_STATS_INC(xs_attr_remove);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return (EIO);
+
+ error = xfs_attr_name_to_xname(&xname, name);
+ if (error)
+ return error;
+
+ xfs_ilock(dp, XFS_ILOCK_SHARED);
+ if (!xfs_inode_hasattr(dp)) {
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+ return XFS_ERROR(ENOATTR);
+ }
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ return xfs_attr_remove_int(dp, &xname, flags);
+}
+
+int
+xfs_attr_list_int(xfs_attr_list_context_t *context)
+{
+ int error;
+ xfs_inode_t *dp = context->dp;
+
+ XFS_STATS_INC(xs_attr_list);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
+
+ xfs_ilock(dp, XFS_ILOCK_SHARED);
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ if (!xfs_inode_hasattr(dp)) {
+ error = 0;
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ error = xfs_attr_shortform_list(context);
+ } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_list(context);
+ } else {
+ error = xfs_attr_node_list(context);
+ }
+
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ return error;
+}
+
+#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
+ (((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
+ ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+ & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_attr_put_listent(
+ xfs_attr_list_context_t *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen,
+ unsigned char *value)
+{
+ struct attrlist *alist = (struct attrlist *)context->alist;
+ attrlist_ent_t *aep;
+ int arraytop;
+
+ ASSERT(!(context->flags & ATTR_KERNOVAL));
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
+
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (((context->flags & ATTR_SECURE) == 0) !=
+ ((flags & XFS_ATTR_SECURE) == 0))
+ return 0;
+ if (((context->flags & ATTR_ROOT) == 0) !=
+ ((flags & XFS_ATTR_ROOT) == 0))
+ return 0;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
+ context->firstu -= ATTR_ENTSIZE(namelen);
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return 1;
+ }
+
+ aep = (attrlist_ent_t *)&context->alist[context->firstu];
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
+ return 0;
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths. Positive return value follows the XFS
+ * convention of being an error, zero or negative return code
+ * is the length of the buffer returned (negated), indicating
+ * success.
+ */
+int
+xfs_attr_list(
+ xfs_inode_t *dp,
+ char *buffer,
+ int bufsize,
+ int flags,
+ attrlist_cursor_kern_t *cursor)
+{
+ xfs_attr_list_context_t context;
+ struct attrlist *alist;
+ int error;
+
+ /*
+ * Validate the cursor.
+ */
+ if (cursor->pad1 || cursor->pad2)
+ return(XFS_ERROR(EINVAL));
+ if ((cursor->initted == 0) &&
+ (cursor->hashval || cursor->blkno || cursor->offset))
+ return XFS_ERROR(EINVAL);
+
+ /*
+ * Check for a properly aligned buffer.
+ */
+ if (((long)buffer) & (sizeof(int)-1))
+ return XFS_ERROR(EFAULT);
+ if (flags & ATTR_KERNOVAL)
+ bufsize = 0;
+
+ /*
+ * Initialize the output buffer.
+ */
+ memset(&context, 0, sizeof(context));
+ context.dp = dp;
+ context.cursor = cursor;
+ context.resynch = 1;
+ context.flags = flags;
+ context.alist = buffer;
+ context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_attr_put_listent;
+
+ alist = (struct attrlist *)context.alist;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list_int(&context);
+ ASSERT(error >= 0);
+ return error;
+}
+
+int /* error */
+xfs_attr_inactive(xfs_inode_t *dp)
+{
+ xfs_trans_t *trans;
+ xfs_mount_t *mp;
+ int error;
+
+ mp = dp->i_mount;
+ ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+ xfs_ilock(dp, XFS_ILOCK_SHARED);
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+ return 0;
+ }
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+ /*
+ * Start our first transaction of the day.
+ *
+ * All future transactions during this code must be "chained" off
+ * this one via the trans_dup() call. All transactions will contain
+ * the inode, and the inode will always be marked with trans_ihold().
+ * Since the inode will be locked in all transactions, we must log
+ * the inode in every transaction to let it float upward through
+ * the log.
+ */
+ trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+ if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_ATTRINVAL_LOG_COUNT))) {
+ xfs_trans_cancel(trans, 0);
+ return(error);
+ }
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+ /*
+ * No need to make quota reservations here. We expect to release some
+ * blocks, not allocate, in the common case.
+ */
+ xfs_trans_ijoin(trans, dp);
+
+ /*
+ * Decide on what work routines to call based on the inode size.
+ */
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ error = 0;
+ goto out;
+ }
+ error = xfs_attr_root_inactive(&trans, dp);
+ if (error)
+ goto out;
+
+ error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK, 0);
+ if (error)
+ goto out;
+
+ /*
+ * Commit the last in the sequence of transactions.
+ */
+ xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+ return(error);
+
+out:
+ xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return(error);
+}
+
+
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+STATIC int
+xfs_attr_shortform_addname(xfs_da_args_t *args)
+{
+ int newsize, forkoff, retval;
+
+ retval = xfs_attr_shortform_lookup(args);
+ if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+ return(retval);
+ } else if (retval == EEXIST) {
+ if (args->flags & ATTR_CREATE)
+ return(retval);
+ retval = xfs_attr_shortform_remove(args);
+ ASSERT(retval == 0);
+ }
+
+ if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
+ args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return(XFS_ERROR(ENOSPC));
+
+ newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
+ newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+
+ forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
+ if (!forkoff)
+ return(XFS_ERROR(ENOSPC));
+
+ xfs_attr_shortform_add(args, forkoff);
+ return(0);
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+ xfs_inode_t *dp;
+ xfs_dabuf_t *bp;
+ int retval, error, committed, forkoff;
+
+ /*
+ * Read the (only) block in the attribute list in.
+ */
+ dp = args->dp;
+ args->blkno = 0;
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+
+ /*
+ * Look up the given attribute in the leaf block. Figure out if
+ * the given flags produce an error or call for an atomic rename.
+ */
+ retval = xfs_attr_leaf_lookup_int(bp, args);
+ if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+ xfs_da_brelse(args->trans, bp);
+ return(retval);
+ } else if (retval == EEXIST) {
+ if (args->flags & ATTR_CREATE) { /* pure create op */
+ xfs_da_brelse(args->trans, bp);
+ return(retval);
+ }
+ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
+ args->blkno2 = args->blkno; /* set 2nd entry info*/
+ args->index2 = args->index;
+ args->rmtblkno2 = args->rmtblkno;
+ args->rmtblkcnt2 = args->rmtblkcnt;
+ }
+
+ /*
+ * Add the attribute to the leaf block, transitioning to a Btree
+ * if required.
+ */
+ retval = xfs_attr_leaf_add(bp, args);
+ xfs_da_buf_done(bp);
+ if (retval == ENOSPC) {
+ /*
+ * Promote the attribute list to the Btree format, then
+ * Commit that transaction so that the node_addname() call
+ * can manage its own transactions.
+ */
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr_leaf_to_node(args);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return(error);
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+
+ /*
+ * Commit the current trans (including the inode) and start
+ * a new one.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ return (error);
+
+ /*
+ * Fob the whole rest of the problem off on the Btree code.
+ */
+ error = xfs_attr_node_addname(args);
+ return(error);
+ }
+
+ /*
+ * Commit the transaction that added the attr name so that
+ * later routines can manage their own transactions.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ return (error);
+
+ /*
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
+ */
+ if (args->rmtblkno > 0) {
+ error = xfs_attr_rmtval_set(args);
+ if (error)
+ return(error);
+ }
+
+ /*
+ * If this is an atomic rename operation, we must "flip" the
+ * incomplete flags on the "new" and "old" attribute/value pairs
+ * so that one disappears and one appears atomically. Then we
+ * must remove the "old" attribute/value pair.
+ */
+ if (args->op_flags & XFS_DA_OP_RENAME) {
+ /*
+ * In a separate transaction, set the incomplete flag on the
+ * "old" attr and clear the incomplete flag on the "new" attr.
+ */
+ error = xfs_attr_leaf_flipflags(args);
+ if (error)
+ return(error);
+
+ /*
+ * Dismantle the "old" attribute/value pair by removing
+ * a "remote" value (if it exists).
+ */
+ args->index = args->index2;
+ args->blkno = args->blkno2;
+ args->rmtblkno = args->rmtblkno2;
+ args->rmtblkcnt = args->rmtblkcnt2;
+ if (args->rmtblkno) {
+ error = xfs_attr_rmtval_remove(args);
+ if (error)
+ return(error);
+ }
+
+ /*
+ * Read in the block containing the "old" attr, then
+ * remove the "old" attr from that block (neat, huh!)
+ */
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
+ &bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ (void)xfs_attr_leaf_remove(bp, args);
+
+ /*
+ * If the result is small enough, shrink it all into the inode.
+ */
+ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans,
+ args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return(error);
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans
+ * and started a new one. We need the inode to be
+ * in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+ } else
+ xfs_da_buf_done(bp);
+
+ /*
+ * Commit the remove and start the next trans in series.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+
+ } else if (args->rmtblkno > 0) {
+ /*
+ * Added a "remote" value, just clear the incomplete flag.
+ */
+ error = xfs_attr_leaf_clearflag(args);
+ }
+ return(error);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+ xfs_inode_t *dp;
+ xfs_dabuf_t *bp;
+ int error, committed, forkoff;
+
+ /*
+ * Remove the attribute.
+ */
+ dp = args->dp;
+ args->blkno = 0;
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error) {
+ return(error);
+ }
+
+ ASSERT(bp != NULL);
+ error = xfs_attr_leaf_lookup_int(bp, args);
+ if (error == ENOATTR) {
+ xfs_da_brelse(args->trans, bp);
+ return(error);
+ }
+
+ (void)xfs_attr_leaf_remove(bp, args);
+
+ /*
+ * If the result is small enough, shrink it all into the inode.
+ */
+ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return(error);
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+ } else
+ xfs_da_buf_done(bp);
+ return(0);
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+ xfs_dabuf_t *bp;
+ int error;
+
+ args->blkno = 0;
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+
+ error = xfs_attr_leaf_lookup_int(bp, args);
+ if (error != EEXIST) {
+ xfs_da_brelse(args->trans, bp);
+ return(error);
+ }
+ error = xfs_attr_leaf_getvalue(bp, args);
+ xfs_da_brelse(args->trans, bp);
+ if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
+ error = xfs_attr_rmtval_get(args);
+ }
+ return(error);
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+ xfs_attr_leafblock_t *leaf;
+ int error;
+ xfs_dabuf_t *bp;
+
+ context->cursor->blkno = 0;
+ error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+ if (error)
+ return XFS_ERROR(error);
+ ASSERT(bp != NULL);
+ leaf = bp->data;
+ if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) {
+ XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
+ context->dp->i_mount, leaf);
+ xfs_da_brelse(NULL, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ error = xfs_attr_leaf_list_int(bp, context);
+ xfs_da_brelse(NULL, bp);
+ return XFS_ERROR(error);
+}
+
+
+/*========================================================================
+ * External routines when attribute list size > XFS_LBSIZE(mp).
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+ xfs_da_state_t *state;
+ xfs_da_state_blk_t *blk;
+ xfs_inode_t *dp;
+ xfs_mount_t *mp;
+ int committed, retval, error;
+
+ /*
+ * Fill in bucket of arguments/results/context to carry around.
+ */
+ dp = args->dp;
+ mp = dp->i_mount;
+restart:
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = mp;
+ state->blocksize = state->mp->m_sb.sb_blocksize;
+ state->node_ents = state->mp->m_attr_node_ents;
+
+ /*
+ * Search to see if name already exists, and get back a pointer
+ * to where it should go.
+ */
+ error = xfs_da_node_lookup_int(state, &retval);
+ if (error)
+ goto out;
+ blk = &state->path.blk[ state->path.active-1 ];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+ goto out;
+ } else if (retval == EEXIST) {
+ if (args->flags & ATTR_CREATE)
+ goto out;
+ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
+ args->blkno2 = args->blkno; /* set 2nd entry info*/
+ args->index2 = args->index;
+ args->rmtblkno2 = args->rmtblkno;
+ args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ }
+
+ retval = xfs_attr_leaf_add(blk->bp, state->args);
+ if (retval == ENOSPC) {
+ if (state->path.active == 1) {
+ /*
+ * Its really a single leaf node, but it had
+ * out-of-line values so it looked like it *might*
+ * have been a b-tree.
+ */
+ xfs_da_state_free(state);
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr_leaf_to_node(args);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans,
+ args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ goto out;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans
+ * and started a new one. We need the inode to be
+ * in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+
+ /*
+ * Commit the node conversion and start the next
+ * trans in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ goto out;
+
+ goto restart;
+ }
+
+ /*
+ * Split as many Btree elements as required.
+ * This code tracks the new and old attr's location
+ * in the index/blkno/rmtblkno/rmtblkcnt fields and
+ * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+ */
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_da_split(state);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ goto out;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+ } else {
+ /*
+ * Addition succeeded, update Btree hashvals.
+ */
+ xfs_da_fixhashpath(state, &state->path);
+ }
+
+ /*
+ * Kill the state structure, we're done with it and need to
+ * allow the buffers to come back later.
+ */
+ xfs_da_state_free(state);
+ state = NULL;
+
+ /*
+ * Commit the leaf addition or btree split and start the next
+ * trans in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ goto out;
+
+ /*
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
+ */
+ if (args->rmtblkno > 0) {
+ error = xfs_attr_rmtval_set(args);
+ if (error)
+ return(error);
+ }
+
+ /*
+ * If this is an atomic rename operation, we must "flip" the
+ * incomplete flags on the "new" and "old" attribute/value pairs
+ * so that one disappears and one appears atomically. Then we
+ * must remove the "old" attribute/value pair.
+ */
+ if (args->op_flags & XFS_DA_OP_RENAME) {
+ /*
+ * In a separate transaction, set the incomplete flag on the
+ * "old" attr and clear the incomplete flag on the "new" attr.
+ */
+ error = xfs_attr_leaf_flipflags(args);
+ if (error)
+ goto out;
+
+ /*
+ * Dismantle the "old" attribute/value pair by removing
+ * a "remote" value (if it exists).
+ */
+ args->index = args->index2;
+ args->blkno = args->blkno2;
+ args->rmtblkno = args->rmtblkno2;
+ args->rmtblkcnt = args->rmtblkcnt2;
+ if (args->rmtblkno) {
+ error = xfs_attr_rmtval_remove(args);
+ if (error)
+ return(error);
+ }
+
+ /*
+ * Re-find the "old" attribute entry after any split ops.
+ * The INCOMPLETE flag means that we will find the "old"
+ * attr, not the "new" one.
+ */
+ args->flags |= XFS_ATTR_INCOMPLETE;
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = mp;
+ state->blocksize = state->mp->m_sb.sb_blocksize;
+ state->node_ents = state->mp->m_attr_node_ents;
+ state->inleaf = 0;
+ error = xfs_da_node_lookup_int(state, &retval);
+ if (error)
+ goto out;
+
+ /*
+ * Remove the name and update the hashvals in the tree.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ error = xfs_attr_leaf_remove(blk->bp, args);
+ xfs_da_fixhashpath(state, &state->path);
+
+ /*
+ * Check to see if the tree needs to be collapsed.
+ */
+ if (retval && (state->path.active > 1)) {
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_da_join(state);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans,
+ args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ goto out;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans
+ * and started a new one. We need the inode to be
+ * in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+ }
+
+ /*
+ * Commit and start the next trans in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ goto out;
+
+ } else if (args->rmtblkno > 0) {
+ /*
+ * Added a "remote" value, just clear the incomplete flag.
+ */
+ error = xfs_attr_leaf_clearflag(args);
+ if (error)
+ goto out;
+ }
+ retval = error = 0;
+
+out:
+ if (state)
+ xfs_da_state_free(state);
+ if (error)
+ return(error);
+ return(retval);
+}
+
+/*
+ * Remove a name from a B-tree attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+ xfs_da_state_t *state;
+ xfs_da_state_blk_t *blk;
+ xfs_inode_t *dp;
+ xfs_dabuf_t *bp;
+ int retval, error, committed, forkoff;
+
+ /*
+ * Tie a string around our finger to remind us where we are.
+ */
+ dp = args->dp;
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = dp->i_mount;
+ state->blocksize = state->mp->m_sb.sb_blocksize;
+ state->node_ents = state->mp->m_attr_node_ents;
+
+ /*
+ * Search to see if name exists, and get back a pointer to it.
+ */
+ error = xfs_da_node_lookup_int(state, &retval);
+ if (error || (retval != EEXIST)) {
+ if (error == 0)
+ error = retval;
+ goto out;
+ }
+
+ /*
+ * If there is an out-of-line value, de-allocate the blocks.
+ * This is done before we remove the attribute so that we don't
+ * overflow the maximum size of a transaction and/or hit a deadlock.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ ASSERT(blk->bp != NULL);
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ if (args->rmtblkno > 0) {
+ /*
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
+ */
+ error = xfs_attr_fillstate(state);
+ if (error)
+ goto out;
+
+ /*
+ * Mark the attribute as INCOMPLETE, then bunmapi() the
+ * remote value.
+ */
+ error = xfs_attr_leaf_setflag(args);
+ if (error)
+ goto out;
+ error = xfs_attr_rmtval_remove(args);
+ if (error)
+ goto out;
+
+ /*
+ * Refill the state structure with buffers, the prior calls
+ * released our buffers.
+ */
+ error = xfs_attr_refillstate(state);
+ if (error)
+ goto out;
+ }
+
+ /*
+ * Remove the name and update the hashvals in the tree.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ retval = xfs_attr_leaf_remove(blk->bp, args);
+ xfs_da_fixhashpath(state, &state->path);
+
+ /*
+ * Check to see if the tree needs to be collapsed.
+ */
+ if (retval && (state->path.active > 1)) {
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_da_join(state);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ goto out;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+
+ /*
+ * Commit the Btree join operation and start a new trans.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ goto out;
+ }
+
+ /*
+ * If the result is small enough, push it all into the inode.
+ */
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ /*
+ * Have to get rid of the copy of this dabuf in the state.
+ */
+ ASSERT(state->path.active == 1);
+ ASSERT(state->path.blk[0].bp);
+ xfs_da_buf_done(state->path.blk[0].bp);
+ state->path.blk[0].bp = NULL;
+
+ error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ goto out;
+ ASSERT(be16_to_cpu(((xfs_attr_leafblock_t *)
+ bp->data)->hdr.info.magic)
+ == XFS_ATTR_LEAF_MAGIC);
+
+ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans,
+ args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ goto out;
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans
+ * and started a new one. We need the inode to be
+ * in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+ } else
+ xfs_da_brelse(args->trans, bp);
+ }
+ error = 0;
+
+out:
+ xfs_da_state_free(state);
+ return(error);
+}
+
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_da_blkno(blk->bp);
+ xfs_da_buf_done(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_da_blkno(blk->bp);
+ xfs_da_buf_done(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ return(0);
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da_read_buf(state->args->trans,
+ state->args->dp,
+ blk->blkno, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da_read_buf(state->args->trans,
+ state->args->dp,
+ blk->blkno, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return(0);
+}
+
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and for single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ */
+STATIC int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+ xfs_da_state_t *state;
+ xfs_da_state_blk_t *blk;
+ int error, retval;
+ int i;
+
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ state->blocksize = state->mp->m_sb.sb_blocksize;
+ state->node_ents = state->mp->m_attr_node_ents;
+
+ /*
+ * Search to see if name exists, and get back a pointer to it.
+ */
+ error = xfs_da_node_lookup_int(state, &retval);
+ if (error) {
+ retval = error;
+ } else if (retval == EEXIST) {
+ blk = &state->path.blk[ state->path.active-1 ];
+ ASSERT(blk->bp != NULL);
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ /*
+ * Get the value, local or "remote"
+ */
+ retval = xfs_attr_leaf_getvalue(blk->bp, args);
+ if (!retval && (args->rmtblkno > 0)
+ && !(args->flags & ATTR_KERNOVAL)) {
+ retval = xfs_attr_rmtval_get(args);
+ }
+ }
+
+ /*
+ * If not in a transaction, we have to release all the buffers.
+ */
+ for (i = 0; i < state->path.active; i++) {
+ xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+
+ xfs_da_state_free(state);
+ return(retval);
+}
+
+STATIC int /* error */
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_leafblock_t *leaf;
+ xfs_da_intnode_t *node;
+ xfs_da_node_entry_t *btree;
+ int error, i;
+ xfs_dabuf_t *bp;
+
+ cursor = context->cursor;
+ cursor->initted = 1;
+
+ /*
+ * Do all sorts of validation on the passed-in cursor structure.
+ * If anything is amiss, ignore the cursor and look up the hashval
+ * starting from the btree root.
+ */
+ bp = NULL;
+ if (cursor->blkno > 0) {
+ error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+ &bp, XFS_ATTR_FORK);
+ if ((error != 0) && (error != EFSCORRUPTED))
+ return(error);
+ if (bp) {
+ node = bp->data;
+ switch (be16_to_cpu(node->hdr.info.magic)) {
+ case XFS_DA_NODE_MAGIC:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_da_brelse(NULL, bp);
+ bp = NULL;
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ leaf = bp->data;
+ if (cursor->hashval > be32_to_cpu(leaf->entries[
+ be16_to_cpu(leaf->hdr.count)-1].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_da_brelse(NULL, bp);
+ bp = NULL;
+ } else if (cursor->hashval <=
+ be32_to_cpu(leaf->entries[0].hashval)) {
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_da_brelse(NULL, bp);
+ bp = NULL;
+ }
+ break;
+ default:
+ trace_xfs_attr_list_wrong_blk(context);
+ xfs_da_brelse(NULL, bp);
+ bp = NULL;
+ }
+ }
+ }
+
+ /*
+ * We did not find what we expected given the cursor's contents,
+ * so we start from the top and work down based on the hash value.
+ * Note that start of node block is same as start of leaf block.
+ */
+ if (bp == NULL) {
+ cursor->blkno = 0;
+ for (;;) {
+ error = xfs_da_read_buf(NULL, context->dp,
+ cursor->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ if (unlikely(bp == NULL)) {
+ XFS_ERROR_REPORT("xfs_attr_node_list(2)",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount);
+ return(XFS_ERROR(EFSCORRUPTED));
+ }
+ node = bp->data;
+ if (be16_to_cpu(node->hdr.info.magic)
+ == XFS_ATTR_LEAF_MAGIC)
+ break;
+ if (unlikely(be16_to_cpu(node->hdr.info.magic)
+ != XFS_DA_NODE_MAGIC)) {
+ XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount,
+ node);
+ xfs_da_brelse(NULL, bp);
+ return(XFS_ERROR(EFSCORRUPTED));
+ }
+ btree = node->btree;
+ for (i = 0; i < be16_to_cpu(node->hdr.count);
+ btree++, i++) {
+ if (cursor->hashval
+ <= be32_to_cpu(btree->hashval)) {
+ cursor->blkno = be32_to_cpu(btree->before);
+ trace_xfs_attr_list_node_descend(context,
+ btree);
+ break;
+ }
+ }
+ if (i == be16_to_cpu(node->hdr.count)) {
+ xfs_da_brelse(NULL, bp);
+ return(0);
+ }
+ xfs_da_brelse(NULL, bp);
+ }
+ }
+ ASSERT(bp != NULL);
+
+ /*
+ * Roll upward through the blocks, processing each leaf block in
+ * order. As long as there is space in the result buffer, keep
+ * adding the information.
+ */
+ for (;;) {
+ leaf = bp->data;
+ if (unlikely(be16_to_cpu(leaf->hdr.info.magic)
+ != XFS_ATTR_LEAF_MAGIC)) {
+ XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount, leaf);
+ xfs_da_brelse(NULL, bp);
+ return(XFS_ERROR(EFSCORRUPTED));
+ }
+ error = xfs_attr_leaf_list_int(bp, context);
+ if (error) {
+ xfs_da_brelse(NULL, bp);
+ return error;
+ }
+ if (context->seen_enough || leaf->hdr.info.forw == 0)
+ break;
+ cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
+ xfs_da_brelse(NULL, bp);
+ error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+ &bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ if (unlikely((bp == NULL))) {
+ XFS_ERROR_REPORT("xfs_attr_node_list(5)",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount);
+ return(XFS_ERROR(EFSCORRUPTED));
+ }
+ }
+ xfs_da_brelse(NULL, bp);
+ return(0);
+}
+
+
+/*========================================================================
+ * External routines for manipulating out-of-line attribute values.
+ *========================================================================*/
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+int
+xfs_attr_rmtval_get(xfs_da_args_t *args)
+{
+ xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
+ xfs_mount_t *mp;
+ xfs_daddr_t dblkno;
+ void *dst;
+ xfs_buf_t *bp;
+ int nmap, error, tmp, valuelen, blkcnt, i;
+ xfs_dablk_t lblkno;
+
+ ASSERT(!(args->flags & ATTR_KERNOVAL));
+
+ mp = args->dp->i_mount;
+ dst = args->value;
+ valuelen = args->valuelen;
+ lblkno = args->rmtblkno;
+ while (valuelen > 0) {
+ nmap = ATTR_RMTVALUE_MAPSIZE;
+ error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
+ args->rmtblkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ NULL, 0, map, &nmap, NULL);
+ if (error)
+ return(error);
+ ASSERT(nmap >= 1);
+
+ for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+ ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+ (map[i].br_startblock != HOLESTARTBLOCK));
+ dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+ blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
+ blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
+ &bp);
+ if (error)
+ return(error);
+
+ tmp = (valuelen < XFS_BUF_SIZE(bp))
+ ? valuelen : XFS_BUF_SIZE(bp);
+ xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
+ xfs_buf_relse(bp);
+ dst += tmp;
+ valuelen -= tmp;
+
+ lblkno += map[i].br_blockcount;
+ }
+ }
+ ASSERT(valuelen == 0);
+ return(0);
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+STATIC int
+xfs_attr_rmtval_set(xfs_da_args_t *args)
+{
+ xfs_mount_t *mp;
+ xfs_fileoff_t lfileoff;
+ xfs_inode_t *dp;
+ xfs_bmbt_irec_t map;
+ xfs_daddr_t dblkno;
+ void *src;
+ xfs_buf_t *bp;
+ xfs_dablk_t lblkno;
+ int blkcnt, valuelen, nmap, error, tmp, committed;
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ src = args->value;
+
+ /*
+ * Find a "hole" in the attribute address space large enough for
+ * us to drop the new attribute's value into.
+ */
+ blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+ lfileoff = 0;
+ error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+ XFS_ATTR_FORK);
+ if (error) {
+ return(error);
+ }
+ args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+ args->rmtblkcnt = blkcnt;
+
+ /*
+ * Roll through the "value", allocating blocks on disk as required.
+ */
+ while (blkcnt > 0) {
+ /*
+ * Allocate a single extent, up to the size of the value.
+ */
+ xfs_bmap_init(args->flist, args->firstblock);
+ nmap = 1;
+ error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
+ blkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
+ XFS_BMAPI_WRITE,
+ args->firstblock, args->total, &map, &nmap,
+ args->flist);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return(error);
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, dp);
+
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+
+ /*
+ * Start the next trans in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Roll through the "value", copying the attribute value to the
+ * already-allocated blocks. Blocks are written synchronously
+ * so that we can know they are all on disk before we turn off
+ * the INCOMPLETE flag.
+ */
+ lblkno = args->rmtblkno;
+ valuelen = args->valuelen;
+ while (valuelen > 0) {
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ xfs_bmap_init(args->flist, args->firstblock);
+ nmap = 1;
+ error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
+ args->rmtblkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ args->firstblock, 0, &map, &nmap,
+ NULL);
+ if (error) {
+ return(error);
+ }
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
+ XBF_LOCK | XBF_DONT_BLOCK);
+ ASSERT(bp);
+ ASSERT(!XFS_BUF_GETERROR(bp));
+
+ tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
+ XFS_BUF_SIZE(bp);
+ xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
+ if (tmp < XFS_BUF_SIZE(bp))
+ xfs_buf_zero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+ if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
+ return (error);
+ }
+ src += tmp;
+ valuelen -= tmp;
+
+ lblkno += map.br_blockcount;
+ }
+ ASSERT(valuelen == 0);
+ return(0);
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+STATIC int
+xfs_attr_rmtval_remove(xfs_da_args_t *args)
+{
+ xfs_mount_t *mp;
+ xfs_bmbt_irec_t map;
+ xfs_buf_t *bp;
+ xfs_daddr_t dblkno;
+ xfs_dablk_t lblkno;
+ int valuelen, blkcnt, nmap, error, done, committed;
+
+ mp = args->dp->i_mount;
+
+ /*
+ * Roll through the "value", invalidating the attribute value's
+ * blocks.
+ */
+ lblkno = args->rmtblkno;
+ valuelen = args->rmtblkcnt;
+ while (valuelen > 0) {
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ xfs_bmap_init(args->flist, args->firstblock);
+ nmap = 1;
+ error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
+ args->rmtblkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ args->firstblock, 0, &map, &nmap,
+ args->flist);
+ if (error) {
+ return(error);
+ }
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ /*
+ * If the "remote" value is in the cache, remove it.
+ */
+ bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
+ if (bp) {
+ XFS_BUF_STALE(bp);
+ XFS_BUF_UNDELAYWRITE(bp);
+ xfs_buf_relse(bp);
+ bp = NULL;
+ }
+
+ valuelen -= map.br_blockcount;
+
+ lblkno += map.br_blockcount;
+ }
+
+ /*
+ * Keep de-allocating extents until the remote-value region is gone.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ done = 0;
+ while (!done) {
+ xfs_bmap_init(args->flist, args->firstblock);
+ error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ 1, args->firstblock, args->flist,
+ &done);
+ if (!error) {
+ error = xfs_bmap_finish(&args->trans, args->flist,
+ &committed);
+ }
+ if (error) {
+ ASSERT(committed);
+ args->trans = NULL;
+ xfs_bmap_cancel(args->flist);
+ return(error);
+ }
+
+ /*
+ * bmap_finish() may have committed the last trans and started
+ * a new one. We need the inode to be in all transactions.
+ */
+ if (committed)
+ xfs_trans_ijoin(args->trans, args->dp);
+
+ /*
+ * Close out trans and start the next one in the chain.
+ */
+ error = xfs_trans_roll(&args->trans, args->dp);
+ if (error)
+ return (error);
+ }
+ return(0);
+}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
new file mode 100644
index 00000000..e920d68e
--- /dev/null
+++ b/fs/xfs/xfs_attr.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ATTR_H__
+#define __XFS_ATTR_H__
+
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
+/*
+ * Large attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.
+ * The internal links in the Btree are logical block offsets into the file.
+ *
+ * Small attribute lists use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+/*========================================================================
+ * External interfaces
+ *========================================================================*/
+
+
+#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */
+#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
+#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */
+#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
+#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
+#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
+
+#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
+#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
+
+#define XFS_ATTR_FLAGS \
+ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
+ { ATTR_ROOT, "ROOT" }, \
+ { ATTR_TRUST, "TRUST" }, \
+ { ATTR_SECURE, "SECURE" }, \
+ { ATTR_CREATE, "CREATE" }, \
+ { ATTR_REPLACE, "REPLACE" }, \
+ { ATTR_KERNOTIME, "KERNOTIME" }, \
+ { ATTR_KERNOVAL, "KERNOVAL" }
+
+/*
+ * The maximum size (into the kernel or returned from the kernel) of an
+ * attribute value or the buffer used for an attr_list() call. Larger
+ * sizes will result in an ERANGE return code.
+ */
+#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
+
+/*
+ * Define how lists of attribute names are returned to the user from
+ * the attr_list() call. A large, 32bit aligned, buffer is passed in
+ * along with its size. We put an array of offsets at the top that each
+ * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
+ */
+typedef struct attrlist {
+ __s32 al_count; /* number of entries in attrlist */
+ __s32 al_more; /* T/F: more attrs (do call again) */
+ __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+} attrlist_t;
+
+/*
+ * Show the interesting info about one attribute. This is what the
+ * al_offset[i] entry points to.
+ */
+typedef struct attrlist_ent { /* data from attr_list() */
+ __u32 a_valuelen; /* number bytes in value of attr */
+ char a_name[1]; /* attr name (NULL terminated) */
+} attrlist_ent_t;
+
+/*
+ * Given a pointer to the (char*) buffer containing the attr_list() result,
+ * and an index, return a pointer to the indicated attribute in the buffer.
+ */
+#define ATTR_ENTRY(buffer, index) \
+ ((attrlist_ent_t *) \
+ &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
+
+/*
+ * Kernel-internal version of the attrlist cursor.
+ */
+typedef struct attrlist_cursor_kern {
+ __u32 hashval; /* hash value of next entry to add */
+ __u32 blkno; /* block containing entry (suggestion) */
+ __u32 offset; /* offset in list of equal-hashvals */
+ __u16 pad1; /* padding to match user-level */
+ __u8 pad2; /* padding to match user-level */
+ __u8 initted; /* T/F: cursor has been initialized */
+} attrlist_cursor_kern_t;
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+
+typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+ unsigned char *, int, int, unsigned char *);
+
+typedef struct xfs_attr_list_context {
+ struct xfs_inode *dp; /* inode */
+ struct attrlist_cursor_kern *cursor; /* position in list */
+ char *alist; /* output buffer */
+ int seen_enough; /* T/F: seen enough of list? */
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ int flags; /* from VOP call */
+ int resynch; /* T/F: resynch with cursor */
+ int put_value; /* T/F: need value for listent */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+} xfs_attr_list_context_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Overall external interface routines.
+ */
+int xfs_attr_inactive(struct xfs_inode *dp);
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_list_int(struct xfs_attr_list_context *);
+
+#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
new file mode 100644
index 00000000..f49ecf2e
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -0,0 +1,2979 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
+ xfs_dabuf_t **bpp);
+STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
+ int freemap_index);
+STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer);
+STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *leaf_blk_1,
+ xfs_da_state_blk_t *leaf_blk_2,
+ int *number_entries_in_blk1,
+ int *number_usedbytes_in_blk1);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
+ xfs_dabuf_t *bp, int level);
+STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
+ xfs_dabuf_t *bp);
+STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
+ xfs_dablk_t blkno, int blkcnt);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
+ int src_start,
+ xfs_attr_leafblock_t *dst_leaf,
+ int dst_start, int move_count,
+ xfs_mount_t *mp);
+STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If namespace bits don't match return 0.
+ * If all match then return 1.
+ */
+STATIC int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+ return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+}
+
+
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Query whether the requested number of additional bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * Returns zero if not, else the di_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * special case for dev/uuid inodes, they have fixed size data forks.
+ */
+int
+xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
+{
+ int offset;
+ int minforkoff; /* lower limit on valid forkoff locations */
+ int maxforkoff; /* upper limit on valid forkoff locations */
+ int dsize;
+ xfs_mount_t *mp = dp->i_mount;
+
+ offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
+
+ switch (dp->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ return (offset >= minforkoff) ? minforkoff : 0;
+ case XFS_DINODE_FMT_UUID:
+ minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
+ return (offset >= minforkoff) ? minforkoff : 0;
+ }
+
+ /*
+ * If the requested numbers of bytes is smaller or equal to the
+ * current attribute fork size we can always proceed.
+ *
+ * Note that if_bytes in the data fork might actually be larger than
+ * the current data fork size is due to delalloc extents. In that
+ * case either the extent count will go down when they are converted
+ * to real extents, or the delalloc conversion will take care of the
+ * literal area rebalancing.
+ */
+ if (bytes <= XFS_IFORK_ASIZE(dp))
+ return dp->i_d.di_forkoff;
+
+ /*
+ * For attr2 we can try to move the forkoff if there is space in the
+ * literal area, but for the old format we are done if there is no
+ * space in the fixed attribute fork.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+ return 0;
+
+ dsize = dp->i_df.if_bytes;
+
+ switch (dp->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /*
+ * If there is no attr fork and the data fork is extents,
+ * determine if creating the default attr fork will result
+ * in the extents form migrating to btree. If so, the
+ * minimum offset only needs to be the space required for
+ * the btree root.
+ */
+ if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+ xfs_default_attroffset(dp))
+ dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /*
+ * If we have a data btree then keep forkoff if we have one,
+ * otherwise we are adding a new attr, so then we set
+ * minforkoff to where the btree root can finish so we have
+ * plenty of room for attrs
+ */
+ if (dp->i_d.di_forkoff) {
+ if (offset < dp->i_d.di_forkoff)
+ return 0;
+ return dp->i_d.di_forkoff;
+ }
+ dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
+ break;
+ }
+
+ /*
+ * A data fork btree root must have space for at least
+ * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+ */
+ minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+ minforkoff = roundup(minforkoff, 8) >> 3;
+
+ /* attr fork btree root can have at least this many key/ptr pairs */
+ maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = maxforkoff >> 3; /* rounded down */
+
+ if (offset >= maxforkoff)
+ return maxforkoff;
+ if (offset >= minforkoff)
+ return offset;
+ return 0;
+}
+
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ */
+STATIC void
+xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+{
+ if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
+ !(xfs_sb_version_hasattr2(&mp->m_sb))) {
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
+ xfs_sb_version_addattr2(&mp->m_sb);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+ } else
+ spin_unlock(&mp->m_sb_lock);
+ }
+}
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+void
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+ xfs_attr_sf_hdr_t *hdr;
+ xfs_inode_t *dp;
+ xfs_ifork_t *ifp;
+
+ dp = args->dp;
+ ASSERT(dp != NULL);
+ ifp = dp->i_afp;
+ ASSERT(ifp != NULL);
+ ASSERT(ifp->if_bytes == 0);
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+ ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */
+ dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+ ifp->if_flags |= XFS_IFINLINE;
+ } else {
+ ASSERT(ifp->if_flags & XFS_IFINLINE);
+ }
+ xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+ hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
+ hdr->count = 0;
+ hdr->totsize = cpu_to_be16(sizeof(*hdr));
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+void
+xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+{
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ int i, offset, size;
+ xfs_mount_t *mp;
+ xfs_inode_t *dp;
+ xfs_ifork_t *ifp;
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ dp->i_d.di_forkoff = forkoff;
+ dp->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
+ dp->i_afp->if_ext_max =
+ XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
+
+ ifp = dp->i_afp;
+ ASSERT(ifp->if_flags & XFS_IFINLINE);
+ sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+#ifdef DEBUG
+ if (sfe->namelen != args->namelen)
+ continue;
+ if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+ continue;
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+ continue;
+ ASSERT(0);
+#endif
+ }
+
+ offset = (char *)sfe - (char *)sf;
+ size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+ sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+
+ sfe->namelen = args->namelen;
+ sfe->valuelen = args->valuelen;
+ sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ memcpy(sfe->nameval, args->name, args->namelen);
+ memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
+ sf->hdr.count++;
+ be16_add_cpu(&sf->hdr.totsize, size);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+ xfs_sbversion_add_attr2(mp, args->trans);
+}
+
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+STATIC void
+xfs_attr_fork_reset(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp)
+{
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ ip->i_d.di_forkoff = 0;
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+ ASSERT(ip->i_d.di_anextents == 0);
+ ASSERT(ip->i_afp == NULL);
+
+ ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ int base, size=0, end, totsize, i;
+ xfs_mount_t *mp;
+ xfs_inode_t *dp;
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ base = sizeof(xfs_attr_sf_hdr_t);
+ sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ sfe = &sf->list[0];
+ end = sf->hdr.count;
+ for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+ base += size, i++) {
+ size = XFS_ATTR_SF_ENTSIZE(sfe);
+ if (sfe->namelen != args->namelen)
+ continue;
+ if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
+ continue;
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+ continue;
+ break;
+ }
+ if (i == end)
+ return(XFS_ERROR(ENOATTR));
+
+ /*
+ * Fix up the attribute fork data, covering the hole
+ */
+ end = base + size;
+ totsize = be16_to_cpu(sf->hdr.totsize);
+ if (end != totsize)
+ memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
+ sf->hdr.count--;
+ be16_add_cpu(&sf->hdr.totsize, -size);
+
+ /*
+ * Fix up the start offset of the attribute fork
+ */
+ totsize -= size;
+ if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+ (mp->m_flags & XFS_MOUNT_ATTR2) &&
+ (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ xfs_attr_fork_reset(dp, args->trans);
+ } else {
+ xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+ ASSERT(dp->i_d.di_forkoff);
+ ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ (args->op_flags & XFS_DA_OP_ADDNAME) ||
+ !(mp->m_flags & XFS_MOUNT_ATTR2) ||
+ dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+ dp->i_afp->if_ext_max =
+ XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
+ dp->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
+ xfs_trans_log_inode(args->trans, dp,
+ XFS_ILOG_CORE | XFS_ILOG_ADATA);
+ }
+
+ xfs_sbversion_add_attr2(mp, args->trans);
+
+ return(0);
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ int i;
+ xfs_ifork_t *ifp;
+
+ ifp = args->dp->i_afp;
+ ASSERT(ifp->if_flags & XFS_IFINLINE);
+ sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ if (sfe->namelen != args->namelen)
+ continue;
+ if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+ continue;
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+ continue;
+ return(XFS_ERROR(EEXIST));
+ }
+ return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ int i;
+
+ ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE);
+ sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+ if (sfe->namelen != args->namelen)
+ continue;
+ if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
+ continue;
+ if (!xfs_attr_namesp_match(args->flags, sfe->flags))
+ continue;
+ if (args->flags & ATTR_KERNOVAL) {
+ args->valuelen = sfe->valuelen;
+ return(XFS_ERROR(EEXIST));
+ }
+ if (args->valuelen < sfe->valuelen) {
+ args->valuelen = sfe->valuelen;
+ return(XFS_ERROR(ERANGE));
+ }
+ args->valuelen = sfe->valuelen;
+ memcpy(args->value, &sfe->nameval[args->namelen],
+ args->valuelen);
+ return(XFS_ERROR(EEXIST));
+ }
+ return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+ xfs_inode_t *dp;
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ xfs_da_args_t nargs;
+ char *tmpbuffer;
+ int error, i, size;
+ xfs_dablk_t blkno;
+ xfs_dabuf_t *bp;
+ xfs_ifork_t *ifp;
+
+ dp = args->dp;
+ ifp = dp->i_afp;
+ sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ size = be16_to_cpu(sf->hdr.totsize);
+ tmpbuffer = kmem_alloc(size, KM_SLEEP);
+ ASSERT(tmpbuffer != NULL);
+ memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+ sf = (xfs_attr_shortform_t *)tmpbuffer;
+
+ xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+ bp = NULL;
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error) {
+ /*
+ * If we hit an IO error middle of the transaction inside
+ * grow_inode(), we may have inconsistent data. Bail out.
+ */
+ if (error == EIO)
+ goto out;
+ xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
+ memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ goto out;
+ }
+
+ ASSERT(blkno == 0);
+ error = xfs_attr_leaf_create(args, blkno, &bp);
+ if (error) {
+ error = xfs_da_shrink_inode(args, 0, bp);
+ bp = NULL;
+ if (error)
+ goto out;
+ xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
+ memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ goto out;
+ }
+
+ memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.dp = dp;
+ nargs.firstblock = args->firstblock;
+ nargs.flist = args->flist;
+ nargs.total = args->total;
+ nargs.whichfork = XFS_ATTR_FORK;
+ nargs.trans = args->trans;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+ sfe = &sf->list[0];
+ for (i = 0; i < sf->hdr.count; i++) {
+ nargs.name = sfe->nameval;
+ nargs.namelen = sfe->namelen;
+ nargs.value = &sfe->nameval[nargs.namelen];
+ nargs.valuelen = sfe->valuelen;
+ nargs.hashval = xfs_da_hashname(sfe->nameval,
+ sfe->namelen);
+ nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+ error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
+ ASSERT(error == ENOATTR);
+ error = xfs_attr_leaf_add(bp, &nargs);
+ ASSERT(error != ENOSPC);
+ if (error)
+ goto out;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ }
+ error = 0;
+
+out:
+ if(bp)
+ xfs_da_buf_done(bp);
+ kmem_free(tmpbuffer);
+ return(error);
+}
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+ xfs_attr_sf_sort_t *sa, *sb;
+
+ sa = (xfs_attr_sf_sort_t *)a;
+ sb = (xfs_attr_sf_sort_t *)b;
+ if (sa->hash < sb->hash) {
+ return(-1);
+ } else if (sa->hash > sb->hash) {
+ return(1);
+ } else {
+ return(sa->entno - sb->entno);
+ }
+}
+
+
+#define XFS_ISRESET_CURSOR(cursor) \
+ (!((cursor)->initted) && !((cursor)->hashval) && \
+ !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then we
+ * we have to calculate each entries' hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_sf_sort_t *sbuf, *sbp;
+ xfs_attr_shortform_t *sf;
+ xfs_attr_sf_entry_t *sfe;
+ xfs_inode_t *dp;
+ int sbsize, nsbuf, count, i;
+ int error;
+
+ ASSERT(context != NULL);
+ dp = context->dp;
+ ASSERT(dp != NULL);
+ ASSERT(dp->i_afp != NULL);
+ sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ ASSERT(sf != NULL);
+ if (!sf->hdr.count)
+ return(0);
+ cursor = context->cursor;
+ ASSERT(cursor != NULL);
+
+ trace_xfs_attr_list_sf(context);
+
+ /*
+ * If the buffer is large enough and the cursor is at the start,
+ * do not bother with sorting since we will return everything in
+ * one buffer and another call using the cursor won't need to be
+ * made.
+ * Note the generous fudge factor of 16 overhead bytes per entry.
+ * If bufsize is zero then put_listent must be a search function
+ * and can just scan through what we have.
+ */
+ if (context->bufsize == 0 ||
+ (XFS_ISRESET_CURSOR(cursor) &&
+ (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ error = context->put_listent(context,
+ sfe->flags,
+ sfe->nameval,
+ (int)sfe->namelen,
+ (int)sfe->valuelen,
+ &sfe->nameval[sfe->namelen]);
+
+ /*
+ * Either search callback finished early or
+ * didn't fit it all in the buffer after all.
+ */
+ if (context->seen_enough)
+ break;
+
+ if (error)
+ return error;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ }
+ trace_xfs_attr_list_sf_all(context);
+ return(0);
+ }
+
+ /* do no more for a search callback */
+ if (context->bufsize == 0)
+ return 0;
+
+ /*
+ * It didn't all fit, so we have to sort everything on hashval.
+ */
+ sbsize = sf->hdr.count * sizeof(*sbuf);
+ sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+
+ /*
+ * Scan the attribute list for the rest of the entries, storing
+ * the relevant info from only those that match into a buffer.
+ */
+ nsbuf = 0;
+ for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ if (unlikely(
+ ((char *)sfe < (char *)sf) ||
+ ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+ XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
+ XFS_ERRLEVEL_LOW,
+ context->dp->i_mount, sfe);
+ kmem_free(sbuf);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ sbp->entno = i;
+ sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+ sbp->name = sfe->nameval;
+ sbp->namelen = sfe->namelen;
+ /* These are bytes, and both on-disk, don't endian-flip */
+ sbp->valuelen = sfe->valuelen;
+ sbp->flags = sfe->flags;
+ sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sbp++;
+ nsbuf++;
+ }
+
+ /*
+ * Sort the entries on hash then entno.
+ */
+ xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+ /*
+ * Re-find our place IN THE SORTED LIST.
+ */
+ count = 0;
+ cursor->initted = 1;
+ cursor->blkno = 0;
+ for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+ if (sbp->hash == cursor->hashval) {
+ if (cursor->offset == count) {
+ break;
+ }
+ count++;
+ } else if (sbp->hash > cursor->hashval) {
+ break;
+ }
+ }
+ if (i == nsbuf) {
+ kmem_free(sbuf);
+ return(0);
+ }
+
+ /*
+ * Loop putting entries into the user buffer.
+ */
+ for ( ; i < nsbuf; i++, sbp++) {
+ if (cursor->hashval != sbp->hash) {
+ cursor->hashval = sbp->hash;
+ cursor->offset = 0;
+ }
+ error = context->put_listent(context,
+ sbp->flags,
+ sbp->name,
+ sbp->namelen,
+ sbp->valuelen,
+ &sbp->name[sbp->namelen]);
+ if (error)
+ return error;
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+
+ kmem_free(sbuf);
+ return(0);
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_local_t *name_loc;
+ int bytes, i;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+
+ entry = &leaf->entries[0];
+ bytes = sizeof(struct xfs_attr_sf_hdr);
+ for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* don't copy partial entries */
+ if (!(entry->flags & XFS_ATTR_LOCAL))
+ return(0);
+ name_loc = xfs_attr_leaf_name_local(leaf, i);
+ if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return(0);
+ if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
+ return(0);
+ bytes += sizeof(struct xfs_attr_sf_entry)-1
+ + name_loc->namelen
+ + be16_to_cpu(name_loc->valuelen);
+ }
+ if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+ (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ (bytes == sizeof(struct xfs_attr_sf_hdr)))
+ return(-1);
+ return(xfs_attr_shortform_bytesfit(dp, bytes));
+}
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ */
+int
+xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_local_t *name_loc;
+ xfs_da_args_t nargs;
+ xfs_inode_t *dp;
+ char *tmpbuffer;
+ int error, i;
+
+ dp = args->dp;
+ tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
+ ASSERT(tmpbuffer != NULL);
+
+ ASSERT(bp != NULL);
+ memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
+ leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
+
+ /*
+ * Clean out the prior contents of the attribute list.
+ */
+ error = xfs_da_shrink_inode(args, 0, bp);
+ if (error)
+ goto out;
+
+ if (forkoff == -1) {
+ ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
+ ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_reset(dp, args->trans);
+ goto out;
+ }
+
+ xfs_attr_shortform_create(args);
+
+ /*
+ * Copy the attributes
+ */
+ memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.dp = dp;
+ nargs.firstblock = args->firstblock;
+ nargs.flist = args->flist;
+ nargs.total = args->total;
+ nargs.whichfork = XFS_ATTR_FORK;
+ nargs.trans = args->trans;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
+ entry = &leaf->entries[0];
+ for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* don't copy partial entries */
+ if (!entry->nameidx)
+ continue;
+ ASSERT(entry->flags & XFS_ATTR_LOCAL);
+ name_loc = xfs_attr_leaf_name_local(leaf, i);
+ nargs.name = name_loc->nameval;
+ nargs.namelen = name_loc->namelen;
+ nargs.value = &name_loc->nameval[nargs.namelen];
+ nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+ nargs.hashval = be32_to_cpu(entry->hashval);
+ nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+ xfs_attr_shortform_add(&nargs, forkoff);
+ }
+ error = 0;
+
+out:
+ kmem_free(tmpbuffer);
+ return(error);
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr_leaf_to_node(xfs_da_args_t *args)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_da_intnode_t *node;
+ xfs_inode_t *dp;
+ xfs_dabuf_t *bp1, *bp2;
+ xfs_dablk_t blkno;
+ int error;
+
+ dp = args->dp;
+ bp1 = bp2 = NULL;
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ goto out;
+ error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
+ XFS_ATTR_FORK);
+ if (error)
+ goto out;
+ ASSERT(bp1 != NULL);
+ bp2 = NULL;
+ error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
+ XFS_ATTR_FORK);
+ if (error)
+ goto out;
+ ASSERT(bp2 != NULL);
+ memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
+ xfs_da_buf_done(bp1);
+ bp1 = NULL;
+ xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+ /*
+ * Set up the new root node.
+ */
+ error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+ if (error)
+ goto out;
+ node = bp1->data;
+ leaf = bp2->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ /* both on-disk, don't endian-flip twice */
+ node->btree[0].hashval =
+ leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval;
+ node->btree[0].before = cpu_to_be32(blkno);
+ node->hdr.count = cpu_to_be16(1);
+ xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+ error = 0;
+out:
+ if (bp1)
+ xfs_da_buf_done(bp1);
+ if (bp2)
+ xfs_da_buf_done(bp2);
+ return(error);
+}
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+STATIC int
+xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_hdr_t *hdr;
+ xfs_inode_t *dp;
+ xfs_dabuf_t *bp;
+ int error;
+
+ dp = args->dp;
+ ASSERT(dp != NULL);
+ error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ leaf = bp->data;
+ memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
+ hdr = &leaf->hdr;
+ hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
+ hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount));
+ if (!hdr->firstused) {
+ hdr->firstused = cpu_to_be16(
+ XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
+ }
+
+ hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
+ hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) -
+ sizeof(xfs_attr_leaf_hdr_t));
+
+ xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+ *bpp = bp;
+ return(0);
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+ xfs_da_state_blk_t *newblk)
+{
+ xfs_dablk_t blkno;
+ int error;
+
+ /*
+ * Allocate space for a new leaf node.
+ */
+ ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+ error = xfs_da_grow_inode(state->args, &blkno);
+ if (error)
+ return(error);
+ error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+ if (error)
+ return(error);
+ newblk->blkno = blkno;
+ newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+ /*
+ * Rebalance the entries across the two leaves.
+ * NOTE: rebalance() currently depends on the 2nd block being empty.
+ */
+ xfs_attr_leaf_rebalance(state, oldblk, newblk);
+ error = xfs_da_blk_link(state, oldblk, newblk);
+ if (error)
+ return(error);
+
+ /*
+ * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+ * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+ * "new" attrs info. Will need the "old" info to remove it later.
+ *
+ * Insert the "new" entry in the correct block.
+ */
+ if (state->inleaf)
+ error = xfs_attr_leaf_add(oldblk->bp, state->args);
+ else
+ error = xfs_attr_leaf_add(newblk->bp, state->args);
+
+ /*
+ * Update last hashval in each block since we added the name.
+ */
+ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+ newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+ return(error);
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_hdr_t *hdr;
+ xfs_attr_leaf_map_t *map;
+ int tablesize, entsize, sum, tmp, i;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT((args->index >= 0)
+ && (args->index <= be16_to_cpu(leaf->hdr.count)));
+ hdr = &leaf->hdr;
+ entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
+ args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+
+ /*
+ * Search through freemap for first-fit on new name length.
+ * (may need to figure in size of entry struct too)
+ */
+ tablesize = (be16_to_cpu(hdr->count) + 1)
+ * sizeof(xfs_attr_leaf_entry_t)
+ + sizeof(xfs_attr_leaf_hdr_t);
+ map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
+ for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
+ if (tablesize > be16_to_cpu(hdr->firstused)) {
+ sum += be16_to_cpu(map->size);
+ continue;
+ }
+ if (!map->size)
+ continue; /* no space in this map */
+ tmp = entsize;
+ if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused))
+ tmp += sizeof(xfs_attr_leaf_entry_t);
+ if (be16_to_cpu(map->size) >= tmp) {
+ tmp = xfs_attr_leaf_add_work(bp, args, i);
+ return(tmp);
+ }
+ sum += be16_to_cpu(map->size);
+ }
+
+ /*
+ * If there are no holes in the address space of the block,
+ * and we don't have enough freespace, then compaction will do us
+ * no good and we should just give up.
+ */
+ if (!hdr->holes && (sum < entsize))
+ return(XFS_ERROR(ENOSPC));
+
+ /*
+ * Compact the entries to coalesce free space.
+ * This may change the hdr->count via dropping INCOMPLETE entries.
+ */
+ xfs_attr_leaf_compact(args->trans, bp);
+
+ /*
+ * After compaction, the block is guaranteed to have only one
+ * free region, in freemap[0]. If it is not big enough, give up.
+ */
+ if (be16_to_cpu(hdr->freemap[0].size)
+ < (entsize + sizeof(xfs_attr_leaf_entry_t)))
+ return(XFS_ERROR(ENOSPC));
+
+ return(xfs_attr_leaf_add_work(bp, args, 0));
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_hdr_t *hdr;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_local_t *name_loc;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ xfs_attr_leaf_map_t *map;
+ xfs_mount_t *mp;
+ int tmp, i;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ hdr = &leaf->hdr;
+ ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
+ ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count)));
+
+ /*
+ * Force open some space in the entry array and fill it in.
+ */
+ entry = &leaf->entries[args->index];
+ if (args->index < be16_to_cpu(hdr->count)) {
+ tmp = be16_to_cpu(hdr->count) - args->index;
+ tmp *= sizeof(xfs_attr_leaf_entry_t);
+ memmove((char *)(entry+1), (char *)entry, tmp);
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+ }
+ be16_add_cpu(&hdr->count, 1);
+
+ /*
+ * Allocate space for the new string (at the end of the run).
+ */
+ map = &hdr->freemap[mapindex];
+ mp = args->trans->t_mountp;
+ ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
+ ASSERT((be16_to_cpu(map->base) & 0x3) == 0);
+ ASSERT(be16_to_cpu(map->size) >=
+ xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
+ mp->m_sb.sb_blocksize, NULL));
+ ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
+ ASSERT((be16_to_cpu(map->size) & 0x3) == 0);
+ be16_add_cpu(&map->size,
+ -xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
+ mp->m_sb.sb_blocksize, &tmp));
+ entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) +
+ be16_to_cpu(map->size));
+ entry->hashval = cpu_to_be32(args->hashval);
+ entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+ entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ if (args->op_flags & XFS_DA_OP_RENAME) {
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ if ((args->blkno2 == args->blkno) &&
+ (args->index2 <= args->index)) {
+ args->index2++;
+ }
+ }
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+ ASSERT((args->index == 0) ||
+ (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
+ ASSERT((args->index == be16_to_cpu(hdr->count)-1) ||
+ (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
+
+ /*
+ * Copy the attribute name and value into the new space.
+ *
+ * For "remote" attribute values, simply note that we need to
+ * allocate space for the "remote" value. We can't actually
+ * allocate the extents in this transaction, and we can't decide
+ * which blocks they should be as we might allocate more blocks
+ * as part of this transaction (a split operation for example).
+ */
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ name_loc->namelen = args->namelen;
+ name_loc->valuelen = cpu_to_be16(args->valuelen);
+ memcpy((char *)name_loc->nameval, args->name, args->namelen);
+ memcpy((char *)&name_loc->nameval[args->namelen], args->value,
+ be16_to_cpu(name_loc->valuelen));
+ } else {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt->namelen = args->namelen;
+ memcpy((char *)name_rmt->name, args->name, args->namelen);
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ /* just in case */
+ name_rmt->valuelen = 0;
+ name_rmt->valueblk = 0;
+ args->rmtblkno = 1;
+ args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+ }
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+ xfs_attr_leaf_entsize(leaf, args->index)));
+
+ /*
+ * Update the control info for this leaf node
+ */
+ if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) {
+ /* both on-disk, don't endian-flip twice */
+ hdr->firstused = entry->nameidx;
+ }
+ ASSERT(be16_to_cpu(hdr->firstused) >=
+ ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
+ tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t)
+ + sizeof(xfs_attr_leaf_hdr_t);
+ map = &hdr->freemap[0];
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
+ if (be16_to_cpu(map->base) == tmp) {
+ be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t));
+ be16_add_cpu(&map->size,
+ -((int)sizeof(xfs_attr_leaf_entry_t)));
+ }
+ }
+ be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index));
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+ return(0);
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
+{
+ xfs_attr_leafblock_t *leaf_s, *leaf_d;
+ xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+ xfs_mount_t *mp;
+ char *tmpbuffer;
+
+ mp = trans->t_mountp;
+ tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
+ ASSERT(tmpbuffer != NULL);
+ memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp));
+ memset(bp->data, 0, XFS_LBSIZE(mp));
+
+ /*
+ * Copy basic information
+ */
+ leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
+ leaf_d = bp->data;
+ hdr_s = &leaf_s->hdr;
+ hdr_d = &leaf_d->hdr;
+ hdr_d->info = hdr_s->info; /* struct copy */
+ hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp));
+ /* handle truncation gracefully */
+ if (!hdr_d->firstused) {
+ hdr_d->firstused = cpu_to_be16(
+ XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
+ }
+ hdr_d->usedbytes = 0;
+ hdr_d->count = 0;
+ hdr_d->holes = 0;
+ hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
+ hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
+ sizeof(xfs_attr_leaf_hdr_t));
+
+ /*
+ * Copy all entry's in the same (sorted) order,
+ * but allocate name/value pairs packed and in sequence.
+ */
+ xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
+ be16_to_cpu(hdr_s->count), mp);
+ xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+
+ kmem_free(tmpbuffer);
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block. At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block. Those
+ * values are used in "atomic rename" operations on attributes. Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2)
+{
+ xfs_da_args_t *args;
+ xfs_da_state_blk_t *tmp_blk;
+ xfs_attr_leafblock_t *leaf1, *leaf2;
+ xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+ int count, totallen, max, space, swap;
+
+ /*
+ * Set up environment.
+ */
+ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+ leaf1 = blk1->bp->data;
+ leaf2 = blk2->bp->data;
+ ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ args = state->args;
+
+ /*
+ * Check ordering of blocks, reverse if it makes things simpler.
+ *
+ * NOTE: Given that all (current) callers pass in an empty
+ * second block, this code should never set "swap".
+ */
+ swap = 0;
+ if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+ tmp_blk = blk1;
+ blk1 = blk2;
+ blk2 = tmp_blk;
+ leaf1 = blk1->bp->data;
+ leaf2 = blk2->bp->data;
+ swap = 1;
+ }
+ hdr1 = &leaf1->hdr;
+ hdr2 = &leaf2->hdr;
+
+ /*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum. Then get
+ * the direction to copy and the number of elements to move.
+ *
+ * "inleaf" is true if the new entry should be inserted into blk1.
+ * If "swap" is also true, then reverse the sense of "inleaf".
+ */
+ state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
+ &count, &totallen);
+ if (swap)
+ state->inleaf = !state->inleaf;
+
+ /*
+ * Move any entries required from leaf to leaf:
+ */
+ if (count < be16_to_cpu(hdr1->count)) {
+ /*
+ * Figure the total bytes to be added to the destination leaf.
+ */
+ /* number entries being moved */
+ count = be16_to_cpu(hdr1->count) - count;
+ space = be16_to_cpu(hdr1->usedbytes) - totallen;
+ space += count * sizeof(xfs_attr_leaf_entry_t);
+
+ /*
+ * leaf2 is the destination, compact it if it looks tight.
+ */
+ max = be16_to_cpu(hdr2->firstused)
+ - sizeof(xfs_attr_leaf_hdr_t);
+ max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max) {
+ xfs_attr_leaf_compact(args->trans, blk2->bp);
+ }
+
+ /*
+ * Move high entries from leaf1 to low end of leaf2.
+ */
+ xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count,
+ leaf2, 0, count, state->mp);
+
+ xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+ xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ } else if (count > be16_to_cpu(hdr1->count)) {
+ /*
+ * I assert that since all callers pass in an empty
+ * second buffer, this code should never execute.
+ */
+
+ /*
+ * Figure the total bytes to be added to the destination leaf.
+ */
+ /* number entries being moved */
+ count -= be16_to_cpu(hdr1->count);
+ space = totallen - be16_to_cpu(hdr1->usedbytes);
+ space += count * sizeof(xfs_attr_leaf_entry_t);
+
+ /*
+ * leaf1 is the destination, compact it if it looks tight.
+ */
+ max = be16_to_cpu(hdr1->firstused)
+ - sizeof(xfs_attr_leaf_hdr_t);
+ max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
+ if (space > max) {
+ xfs_attr_leaf_compact(args->trans, blk1->bp);
+ }
+
+ /*
+ * Move low entries from leaf2 to high end of leaf1.
+ */
+ xfs_attr_leaf_moveents(leaf2, 0, leaf1,
+ be16_to_cpu(hdr1->count), count, state->mp);
+
+ xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+ xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ }
+
+ /*
+ * Copy out last hashval in each block for B-tree code.
+ */
+ blk1->hashval = be32_to_cpu(
+ leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval);
+ blk2->hashval = be32_to_cpu(
+ leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval);
+
+ /*
+ * Adjust the expected index for insertion.
+ * NOTE: this code depends on the (current) situation that the
+ * second block was originally empty.
+ *
+ * If the insertion point moved to the 2nd block, we must adjust
+ * the index. We must also track the entry just following the
+ * new entry for use in an "atomic rename" operation, that entry
+ * is always the "old" entry and the "new" entry is what we are
+ * inserting. The index/blkno fields refer to the "old" entry,
+ * while the index2/blkno2 fields refer to the "new" entry.
+ */
+ if (blk1->index > be16_to_cpu(leaf1->hdr.count)) {
+ ASSERT(state->inleaf == 0);
+ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+ args->index = args->index2 = blk2->index;
+ args->blkno = args->blkno2 = blk2->blkno;
+ } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) {
+ if (state->inleaf) {
+ args->index = blk1->index;
+ args->blkno = blk1->blkno;
+ args->index2 = 0;
+ args->blkno2 = blk2->blkno;
+ } else {
+ blk2->index = blk1->index
+ - be16_to_cpu(leaf1->hdr.count);
+ args->index = args->index2 = blk2->index;
+ args->blkno = args->blkno2 = blk2->blkno;
+ }
+ } else {
+ ASSERT(state->inleaf == 1);
+ args->index = args->index2 = blk1->index;
+ args->blkno = args->blkno2 = blk1->blkno;
+ }
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary? With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2,
+ int *countarg, int *usedbytesarg)
+{
+ xfs_attr_leafblock_t *leaf1, *leaf2;
+ xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+ xfs_attr_leaf_entry_t *entry;
+ int count, max, index, totallen, half;
+ int lastdelta, foundit, tmp;
+
+ /*
+ * Set up environment.
+ */
+ leaf1 = blk1->bp->data;
+ leaf2 = blk2->bp->data;
+ hdr1 = &leaf1->hdr;
+ hdr2 = &leaf2->hdr;
+ foundit = 0;
+ totallen = 0;
+
+ /*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ */
+ max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count);
+ half = (max+1) * sizeof(*entry);
+ half += be16_to_cpu(hdr1->usedbytes) +
+ be16_to_cpu(hdr2->usedbytes) +
+ xfs_attr_leaf_newentsize(
+ state->args->namelen,
+ state->args->valuelen,
+ state->blocksize, NULL);
+ half /= 2;
+ lastdelta = state->blocksize;
+ entry = &leaf1->entries[0];
+ for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
+ /*
+ * The new entry is in the first block, account for it.
+ */
+ if (count == blk1->index) {
+ tmp = totallen + sizeof(*entry) +
+ xfs_attr_leaf_newentsize(
+ state->args->namelen,
+ state->args->valuelen,
+ state->blocksize, NULL);
+ if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+ break;
+ lastdelta = XFS_ATTR_ABS(half - tmp);
+ totallen = tmp;
+ foundit = 1;
+ }
+
+ /*
+ * Wrap around into the second block if necessary.
+ */
+ if (count == be16_to_cpu(hdr1->count)) {
+ leaf1 = leaf2;
+ entry = &leaf1->entries[0];
+ index = 0;
+ }
+
+ /*
+ * Figure out if next leaf entry would be too much.
+ */
+ tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+ index);
+ if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+ break;
+ lastdelta = XFS_ATTR_ABS(half - tmp);
+ totallen = tmp;
+#undef XFS_ATTR_ABS
+ }
+
+ /*
+ * Calculate the number of usedbytes that will end up in lower block.
+ * If new entry not in lower block, fix up the count.
+ */
+ totallen -= count * sizeof(*entry);
+ if (foundit) {
+ totallen -= sizeof(*entry) +
+ xfs_attr_leaf_newentsize(
+ state->args->namelen,
+ state->args->valuelen,
+ state->blocksize, NULL);
+ }
+
+ *countarg = count;
+ *usedbytesarg = totallen;
+ return(foundit);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_da_state_blk_t *blk;
+ xfs_da_blkinfo_t *info;
+ int count, bytes, forward, error, retval, i;
+ xfs_dablk_t blkno;
+ xfs_dabuf_t *bp;
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ info = blk->bp->data;
+ ASSERT(be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC);
+ leaf = (xfs_attr_leafblock_t *)info;
+ count = be16_to_cpu(leaf->hdr.count);
+ bytes = sizeof(xfs_attr_leaf_hdr_t) +
+ count * sizeof(xfs_attr_leaf_entry_t) +
+ be16_to_cpu(leaf->hdr.usedbytes);
+ if (bytes > (state->blocksize >> 1)) {
+ *action = 0; /* blk over 50%, don't try to join */
+ return(0);
+ }
+
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (info->forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ if (error)
+ return(error);
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 2;
+ }
+ return(0);
+ }
+
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink an attribute list over time.
+ */
+ /* start with smaller blk num */
+ forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+ for (i = 0; i < 2; forward = !forward, i++) {
+ if (forward)
+ blkno = be32_to_cpu(info->forw);
+ else
+ blkno = be32_to_cpu(info->back);
+ if (blkno == 0)
+ continue;
+ error = xfs_da_read_buf(state->args->trans, state->args->dp,
+ blkno, -1, &bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+
+ leaf = (xfs_attr_leafblock_t *)info;
+ count = be16_to_cpu(leaf->hdr.count);
+ bytes = state->blocksize - (state->blocksize>>2);
+ bytes -= be16_to_cpu(leaf->hdr.usedbytes);
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ count += be16_to_cpu(leaf->hdr.count);
+ bytes -= be16_to_cpu(leaf->hdr.usedbytes);
+ bytes -= count * sizeof(xfs_attr_leaf_entry_t);
+ bytes -= sizeof(xfs_attr_leaf_hdr_t);
+ xfs_da_brelse(state->args->trans, bp);
+ if (bytes >= 0)
+ break; /* fits with at least 25% to spare */
+ }
+ if (i >= 2) {
+ *action = 0;
+ return(0);
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno) {
+ error = xfs_da_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ } else {
+ error = xfs_da_path_shift(state, &state->path, forward,
+ 0, &retval);
+ }
+ if (error)
+ return(error);
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 1;
+ }
+ return(0);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_hdr_t *hdr;
+ xfs_attr_leaf_map_t *map;
+ xfs_attr_leaf_entry_t *entry;
+ int before, after, smallest, entsize;
+ int tablesize, tmp, i;
+ xfs_mount_t *mp;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ hdr = &leaf->hdr;
+ mp = args->trans->t_mountp;
+ ASSERT((be16_to_cpu(hdr->count) > 0)
+ && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8)));
+ ASSERT((args->index >= 0)
+ && (args->index < be16_to_cpu(hdr->count)));
+ ASSERT(be16_to_cpu(hdr->firstused) >=
+ ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
+ entry = &leaf->entries[args->index];
+ ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused));
+ ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+
+ /*
+ * Scan through free region table:
+ * check for adjacency of free'd entry with an existing one,
+ * find smallest free region in case we need to replace it,
+ * adjust any map that borders the entry table,
+ */
+ tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t)
+ + sizeof(xfs_attr_leaf_hdr_t);
+ map = &hdr->freemap[0];
+ tmp = be16_to_cpu(map->size);
+ before = after = -1;
+ smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+ entsize = xfs_attr_leaf_entsize(leaf, args->index);
+ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
+ ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
+ ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
+ if (be16_to_cpu(map->base) == tablesize) {
+ be16_add_cpu(&map->base,
+ -((int)sizeof(xfs_attr_leaf_entry_t)));
+ be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t));
+ }
+
+ if ((be16_to_cpu(map->base) + be16_to_cpu(map->size))
+ == be16_to_cpu(entry->nameidx)) {
+ before = i;
+ } else if (be16_to_cpu(map->base)
+ == (be16_to_cpu(entry->nameidx) + entsize)) {
+ after = i;
+ } else if (be16_to_cpu(map->size) < tmp) {
+ tmp = be16_to_cpu(map->size);
+ smallest = i;
+ }
+ }
+
+ /*
+ * Coalesce adjacent freemap regions,
+ * or replace the smallest region.
+ */
+ if ((before >= 0) || (after >= 0)) {
+ if ((before >= 0) && (after >= 0)) {
+ map = &hdr->freemap[before];
+ be16_add_cpu(&map->size, entsize);
+ be16_add_cpu(&map->size,
+ be16_to_cpu(hdr->freemap[after].size));
+ hdr->freemap[after].base = 0;
+ hdr->freemap[after].size = 0;
+ } else if (before >= 0) {
+ map = &hdr->freemap[before];
+ be16_add_cpu(&map->size, entsize);
+ } else {
+ map = &hdr->freemap[after];
+ /* both on-disk, don't endian flip twice */
+ map->base = entry->nameidx;
+ be16_add_cpu(&map->size, entsize);
+ }
+ } else {
+ /*
+ * Replace smallest region (if it is smaller than free'd entry)
+ */
+ map = &hdr->freemap[smallest];
+ if (be16_to_cpu(map->size) < entsize) {
+ map->base = cpu_to_be16(be16_to_cpu(entry->nameidx));
+ map->size = cpu_to_be16(entsize);
+ }
+ }
+
+ /*
+ * Did we remove the first entry?
+ */
+ if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused))
+ smallest = 1;
+ else
+ smallest = 0;
+
+ /*
+ * Compress the remaining entries and zero out the removed stuff.
+ */
+ memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
+ be16_add_cpu(&hdr->usedbytes, -entsize);
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+ entsize));
+
+ tmp = (be16_to_cpu(hdr->count) - args->index)
+ * sizeof(xfs_attr_leaf_entry_t);
+ memmove((char *)entry, (char *)(entry+1), tmp);
+ be16_add_cpu(&hdr->count, -1);
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+ entry = &leaf->entries[be16_to_cpu(hdr->count)];
+ memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
+
+ /*
+ * If we removed the first entry, re-find the first used byte
+ * in the name area. Note that if the entry was the "firstused",
+ * then we don't have a "hole" in our block resulting from
+ * removing the name.
+ */
+ if (smallest) {
+ tmp = XFS_LBSIZE(mp);
+ entry = &leaf->entries[0];
+ for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
+ ASSERT(be16_to_cpu(entry->nameidx) >=
+ be16_to_cpu(hdr->firstused));
+ ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+
+ if (be16_to_cpu(entry->nameidx) < tmp)
+ tmp = be16_to_cpu(entry->nameidx);
+ }
+ hdr->firstused = cpu_to_be16(tmp);
+ if (!hdr->firstused) {
+ hdr->firstused = cpu_to_be16(
+ tmp - XFS_ATTR_LEAF_NAME_ALIGN);
+ }
+ } else {
+ hdr->holes = 1; /* mark as needing compaction */
+ }
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+
+ /*
+ * Check if leaf is less than 50% full, caller may want to
+ * "join" the leaf with a sibling if so.
+ */
+ tmp = sizeof(xfs_attr_leaf_hdr_t);
+ tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t);
+ tmp += be16_to_cpu(leaf->hdr.usedbytes);
+ return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
+}
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+ xfs_da_state_blk_t *save_blk)
+{
+ xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
+ xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
+ xfs_mount_t *mp;
+ char *tmpbuffer;
+
+ /*
+ * Set up environment.
+ */
+ mp = state->mp;
+ ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+ drop_leaf = drop_blk->bp->data;
+ save_leaf = save_blk->bp->data;
+ ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ drop_hdr = &drop_leaf->hdr;
+ save_hdr = &save_leaf->hdr;
+
+ /*
+ * Save last hashval from dying block for later Btree fixup.
+ */
+ drop_blk->hashval = be32_to_cpu(
+ drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval);
+
+ /*
+ * Check if we need a temp buffer, or can we do it in place.
+ * Note that we don't check "leaf" for holes because we will
+ * always be dropping it, toosmall() decided that for us already.
+ */
+ if (save_hdr->holes == 0) {
+ /*
+ * dest leaf has no holes, so we add there. May need
+ * to make some room in the entry array.
+ */
+ if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
+ xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
+ be16_to_cpu(drop_hdr->count), mp);
+ } else {
+ xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
+ be16_to_cpu(save_hdr->count),
+ be16_to_cpu(drop_hdr->count), mp);
+ }
+ } else {
+ /*
+ * Destination has holes, so we make a temporary copy
+ * of the leaf and add them both to that.
+ */
+ tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
+ ASSERT(tmpbuffer != NULL);
+ memset(tmpbuffer, 0, state->blocksize);
+ tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+ tmp_hdr = &tmp_leaf->hdr;
+ tmp_hdr->info = save_hdr->info; /* struct copy */
+ tmp_hdr->count = 0;
+ tmp_hdr->firstused = cpu_to_be16(state->blocksize);
+ if (!tmp_hdr->firstused) {
+ tmp_hdr->firstused = cpu_to_be16(
+ state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
+ }
+ tmp_hdr->usedbytes = 0;
+ if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
+ xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
+ be16_to_cpu(drop_hdr->count), mp);
+ xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
+ be16_to_cpu(tmp_leaf->hdr.count),
+ be16_to_cpu(save_hdr->count), mp);
+ } else {
+ xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
+ be16_to_cpu(save_hdr->count), mp);
+ xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
+ be16_to_cpu(tmp_leaf->hdr.count),
+ be16_to_cpu(drop_hdr->count), mp);
+ }
+ memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
+ kmem_free(tmpbuffer);
+ }
+
+ xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
+ state->blocksize - 1);
+
+ /*
+ * Copy out last hashval in each block for B-tree code.
+ */
+ save_blk->hashval = be32_to_cpu(
+ save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node. The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_local_t *name_loc;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ int probe, span;
+ xfs_dahash_t hashval;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(be16_to_cpu(leaf->hdr.count)
+ < (XFS_LBSIZE(args->dp->i_mount)/8));
+
+ /*
+ * Binary search. (note: small blocks will skip this loop)
+ */
+ hashval = args->hashval;
+ probe = span = be16_to_cpu(leaf->hdr.count) / 2;
+ for (entry = &leaf->entries[probe]; span > 4;
+ entry = &leaf->entries[probe]) {
+ span /= 2;
+ if (be32_to_cpu(entry->hashval) < hashval)
+ probe += span;
+ else if (be32_to_cpu(entry->hashval) > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ ASSERT((probe >= 0) &&
+ (!leaf->hdr.count
+ || (probe < be16_to_cpu(leaf->hdr.count))));
+ ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
+
+ /*
+ * Since we may have duplicate hashval's, find the first matching
+ * hashval in the leaf.
+ */
+ while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) {
+ entry--;
+ probe--;
+ }
+ while ((probe < be16_to_cpu(leaf->hdr.count)) &&
+ (be32_to_cpu(entry->hashval) < hashval)) {
+ entry++;
+ probe++;
+ }
+ if ((probe == be16_to_cpu(leaf->hdr.count)) ||
+ (be32_to_cpu(entry->hashval) != hashval)) {
+ args->index = probe;
+ return(XFS_ERROR(ENOATTR));
+ }
+
+ /*
+ * Duplicate keys may be present, so search all of them for a match.
+ */
+ for ( ; (probe < be16_to_cpu(leaf->hdr.count)) &&
+ (be32_to_cpu(entry->hashval) == hashval);
+ entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+ /*
+ * If we are looking for INCOMPLETE entries, show only those.
+ * If we are looking for complete entries, show only those.
+ */
+ if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+ (entry->flags & XFS_ATTR_INCOMPLETE)) {
+ continue;
+ }
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr_leaf_name_local(leaf, probe);
+ if (name_loc->namelen != args->namelen)
+ continue;
+ if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
+ continue;
+ if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ continue;
+ args->index = probe;
+ return(XFS_ERROR(EEXIST));
+ } else {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
+ if (name_rmt->namelen != args->namelen)
+ continue;
+ if (memcmp(args->name, (char *)name_rmt->name,
+ args->namelen) != 0)
+ continue;
+ if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ continue;
+ args->index = probe;
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ return(XFS_ERROR(EEXIST));
+ }
+ }
+ args->index = probe;
+ return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ */
+int
+xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+ int valuelen;
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_local_t *name_loc;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(be16_to_cpu(leaf->hdr.count)
+ < (XFS_LBSIZE(args->dp->i_mount)/8));
+ ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+
+ entry = &leaf->entries[args->index];
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ ASSERT(name_loc->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
+ valuelen = be16_to_cpu(name_loc->valuelen);
+ if (args->flags & ATTR_KERNOVAL) {
+ args->valuelen = valuelen;
+ return(0);
+ }
+ if (args->valuelen < valuelen) {
+ args->valuelen = valuelen;
+ return(XFS_ERROR(ERANGE));
+ }
+ args->valuelen = valuelen;
+ memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
+ } else {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ ASSERT(name_rmt->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+ valuelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+ if (args->flags & ATTR_KERNOVAL) {
+ args->valuelen = valuelen;
+ return(0);
+ }
+ if (args->valuelen < valuelen) {
+ args->valuelen = valuelen;
+ return(XFS_ERROR(ERANGE));
+ }
+ args->valuelen = valuelen;
+ }
+ return(0);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
+ xfs_attr_leafblock_t *leaf_d, int start_d,
+ int count, xfs_mount_t *mp)
+{
+ xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+ xfs_attr_leaf_entry_t *entry_s, *entry_d;
+ int desti, tmp, i;
+
+ /*
+ * Check for nothing to do.
+ */
+ if (count == 0)
+ return;
+
+ /*
+ * Set up environment.
+ */
+ ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ hdr_s = &leaf_s->hdr;
+ hdr_d = &leaf_d->hdr;
+ ASSERT((be16_to_cpu(hdr_s->count) > 0) &&
+ (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8)));
+ ASSERT(be16_to_cpu(hdr_s->firstused) >=
+ ((be16_to_cpu(hdr_s->count)
+ * sizeof(*entry_s))+sizeof(*hdr_s)));
+ ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8));
+ ASSERT(be16_to_cpu(hdr_d->firstused) >=
+ ((be16_to_cpu(hdr_d->count)
+ * sizeof(*entry_d))+sizeof(*hdr_d)));
+
+ ASSERT(start_s < be16_to_cpu(hdr_s->count));
+ ASSERT(start_d <= be16_to_cpu(hdr_d->count));
+ ASSERT(count <= be16_to_cpu(hdr_s->count));
+
+ /*
+ * Move the entries in the destination leaf up to make a hole?
+ */
+ if (start_d < be16_to_cpu(hdr_d->count)) {
+ tmp = be16_to_cpu(hdr_d->count) - start_d;
+ tmp *= sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &leaf_d->entries[start_d];
+ entry_d = &leaf_d->entries[start_d + count];
+ memmove((char *)entry_d, (char *)entry_s, tmp);
+ }
+
+ /*
+ * Copy all entry's in the same (sorted) order,
+ * but allocate attribute info packed and in sequence.
+ */
+ entry_s = &leaf_s->entries[start_s];
+ entry_d = &leaf_d->entries[start_d];
+ desti = start_d;
+ for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+ ASSERT(be16_to_cpu(entry_s->nameidx)
+ >= be16_to_cpu(hdr_s->firstused));
+ tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+ /*
+ * Code to drop INCOMPLETE entries. Difficult to use as we
+ * may also need to change the insertion index. Code turned
+ * off for 6.2, should be revisited later.
+ */
+ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+ memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
+ be16_add_cpu(&hdr_s->usedbytes, -tmp);
+ be16_add_cpu(&hdr_s->count, -1);
+ entry_d--; /* to compensate for ++ in loop hdr */
+ desti--;
+ if ((start_s + i) < offset)
+ result++; /* insertion index adjustment */
+ } else {
+#endif /* GROT */
+ be16_add_cpu(&hdr_d->firstused, -tmp);
+ /* both on-disk, don't endian flip twice */
+ entry_d->hashval = entry_s->hashval;
+ /* both on-disk, don't endian flip twice */
+ entry_d->nameidx = hdr_d->firstused;
+ entry_d->flags = entry_s->flags;
+ ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+ <= XFS_LBSIZE(mp));
+ memmove(xfs_attr_leaf_name(leaf_d, desti),
+ xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
+ ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+ <= XFS_LBSIZE(mp));
+ memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
+ be16_add_cpu(&hdr_s->usedbytes, -tmp);
+ be16_add_cpu(&hdr_d->usedbytes, tmp);
+ be16_add_cpu(&hdr_s->count, -1);
+ be16_add_cpu(&hdr_d->count, 1);
+ tmp = be16_to_cpu(hdr_d->count)
+ * sizeof(xfs_attr_leaf_entry_t)
+ + sizeof(xfs_attr_leaf_hdr_t);
+ ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp);
+#ifdef GROT
+ }
+#endif /* GROT */
+ }
+
+ /*
+ * Zero out the entries we just copied.
+ */
+ if (start_s == be16_to_cpu(hdr_s->count)) {
+ tmp = count * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &leaf_s->entries[start_s];
+ ASSERT(((char *)entry_s + tmp) <=
+ ((char *)leaf_s + XFS_LBSIZE(mp)));
+ memset((char *)entry_s, 0, tmp);
+ } else {
+ /*
+ * Move the remaining entries down to fill the hole,
+ * then zero the entries at the top.
+ */
+ tmp = be16_to_cpu(hdr_s->count) - count;
+ tmp *= sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &leaf_s->entries[start_s + count];
+ entry_d = &leaf_s->entries[start_s];
+ memmove((char *)entry_d, (char *)entry_s, tmp);
+
+ tmp = count * sizeof(xfs_attr_leaf_entry_t);
+ entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)];
+ ASSERT(((char *)entry_s + tmp) <=
+ ((char *)leaf_s + XFS_LBSIZE(mp)));
+ memset((char *)entry_s, 0, tmp);
+ }
+
+ /*
+ * Fill in the freemap information
+ */
+ hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
+ be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) *
+ sizeof(xfs_attr_leaf_entry_t));
+ hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused)
+ - be16_to_cpu(hdr_d->freemap[0].base));
+ hdr_d->freemap[1].base = 0;
+ hdr_d->freemap[2].base = 0;
+ hdr_d->freemap[1].size = 0;
+ hdr_d->freemap[2].size = 0;
+ hdr_s->holes = 1; /* leaf may not be compact */
+}
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+int
+xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
+{
+ xfs_attr_leafblock_t *leaf1, *leaf2;
+
+ leaf1 = leaf1_bp->data;
+ leaf2 = leaf2_bp->data;
+ ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC) &&
+ (be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC));
+ if ((be16_to_cpu(leaf1->hdr.count) > 0) &&
+ (be16_to_cpu(leaf2->hdr.count) > 0) &&
+ ((be32_to_cpu(leaf2->entries[0].hashval) <
+ be32_to_cpu(leaf1->entries[0].hashval)) ||
+ (be32_to_cpu(leaf2->entries[
+ be16_to_cpu(leaf2->hdr.count)-1].hashval) <
+ be32_to_cpu(leaf1->entries[
+ be16_to_cpu(leaf1->hdr.count)-1].hashval)))) {
+ return(1);
+ }
+ return(0);
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
+{
+ xfs_attr_leafblock_t *leaf;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ if (count)
+ *count = be16_to_cpu(leaf->hdr.count);
+ if (!leaf->hdr.count)
+ return(0);
+ return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval);
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote only calculate bytes in this block).
+ */
+STATIC int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+ xfs_attr_leaf_name_local_t *name_loc;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ int size;
+
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr_leaf_name_local(leaf, index);
+ size = xfs_attr_leaf_entsize_local(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
+ } else {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, index);
+ size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
+ }
+ return(size);
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote only calculate bytes in this block).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
+{
+ int size;
+
+ size = xfs_attr_leaf_entsize_local(namelen, valuelen);
+ if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
+ if (local) {
+ *local = 1;
+ }
+ } else {
+ size = xfs_attr_leaf_entsize_remote(namelen);
+ if (local) {
+ *local = 0;
+ }
+ }
+ return(size);
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
+{
+ attrlist_cursor_kern_t *cursor;
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ int retval, i;
+
+ ASSERT(bp != NULL);
+ leaf = bp->data;
+ cursor = context->cursor;
+ cursor->initted = 1;
+
+ trace_xfs_attr_list_leaf(context);
+
+ /*
+ * Re-find our place in the leaf block if this is a new syscall.
+ */
+ if (context->resynch) {
+ entry = &leaf->entries[0];
+ for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+ if (cursor->offset == context->dupcnt) {
+ context->dupcnt = 0;
+ break;
+ }
+ context->dupcnt++;
+ } else if (be32_to_cpu(entry->hashval) >
+ cursor->hashval) {
+ context->dupcnt = 0;
+ break;
+ }
+ }
+ if (i == be16_to_cpu(leaf->hdr.count)) {
+ trace_xfs_attr_list_notfound(context);
+ return(0);
+ }
+ } else {
+ entry = &leaf->entries[0];
+ i = 0;
+ }
+ context->resynch = 0;
+
+ /*
+ * We have found our place, start copying out the new attributes.
+ */
+ retval = 0;
+ for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
+ if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+ cursor->hashval = be32_to_cpu(entry->hashval);
+ cursor->offset = 0;
+ }
+
+ if (entry->flags & XFS_ATTR_INCOMPLETE)
+ continue; /* skip incomplete entries */
+
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ xfs_attr_leaf_name_local_t *name_loc =
+ xfs_attr_leaf_name_local(leaf, i);
+
+ retval = context->put_listent(context,
+ entry->flags,
+ name_loc->nameval,
+ (int)name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen),
+ &name_loc->nameval[name_loc->namelen]);
+ if (retval)
+ return retval;
+ } else {
+ xfs_attr_leaf_name_remote_t *name_rmt =
+ xfs_attr_leaf_name_remote(leaf, i);
+
+ int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+ if (context->put_value) {
+ xfs_da_args_t args;
+
+ memset((char *)&args, 0, sizeof(args));
+ args.dp = context->dp;
+ args.whichfork = XFS_ATTR_FORK;
+ args.valuelen = valuelen;
+ args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
+ args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
+ retval = xfs_attr_rmtval_get(&args);
+ if (retval)
+ return retval;
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ args.value);
+ kmem_free(args.value);
+ } else {
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ NULL);
+ }
+ if (retval)
+ return retval;
+ }
+ if (context->seen_enough)
+ break;
+ cursor->offset++;
+ }
+ trace_xfs_attr_list_leaf_end(context);
+ return(retval);
+}
+
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ xfs_dabuf_t *bp;
+ int error;
+#ifdef DEBUG
+ xfs_attr_leaf_name_local_t *name_loc;
+ int namelen;
+ char *name;
+#endif /* DEBUG */
+
+ /*
+ * Set up the operation.
+ */
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error) {
+ return(error);
+ }
+ ASSERT(bp != NULL);
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+ ASSERT(args->index >= 0);
+ entry = &leaf->entries[ args->index ];
+ ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+ if (entry->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+ namelen = name_loc->namelen;
+ name = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ namelen = name_rmt->namelen;
+ name = (char *)name_rmt->name;
+ }
+ ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
+ ASSERT(namelen == args->namelen);
+ ASSERT(memcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+ entry->flags &= ~XFS_ATTR_INCOMPLETE;
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+ if (args->rmtblkno) {
+ ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+ name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+ }
+ xfs_da_buf_done(bp);
+
+ /*
+ * Commit the flag value change and start the next trans in series.
+ */
+ return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr_leaf_setflag(xfs_da_args_t *args)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ xfs_dabuf_t *bp;
+ int error;
+
+ /*
+ * Set up the operation.
+ */
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+ XFS_ATTR_FORK);
+ if (error) {
+ return(error);
+ }
+ ASSERT(bp != NULL);
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+ ASSERT(args->index >= 0);
+ entry = &leaf->entries[ args->index ];
+
+ ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+ entry->flags |= XFS_ATTR_INCOMPLETE;
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+ if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+ name_rmt->valueblk = 0;
+ name_rmt->valuelen = 0;
+ xfs_da_log_buf(args->trans, bp,
+ XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+ }
+ xfs_da_buf_done(bp);
+
+ /*
+ * Commit the flag value change and start the next trans in series.
+ */
+ return xfs_trans_roll(&args->trans, args->dp);
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+{
+ xfs_attr_leafblock_t *leaf1, *leaf2;
+ xfs_attr_leaf_entry_t *entry1, *entry2;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ xfs_dabuf_t *bp1, *bp2;
+ int error;
+#ifdef DEBUG
+ xfs_attr_leaf_name_local_t *name_loc;
+ int namelen1, namelen2;
+ char *name1, *name2;
+#endif /* DEBUG */
+
+ /*
+ * Read the block containing the "old" attr
+ */
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
+ XFS_ATTR_FORK);
+ if (error) {
+ return(error);
+ }
+ ASSERT(bp1 != NULL);
+
+ /*
+ * Read the block containing the "new" attr, if it is different
+ */
+ if (args->blkno2 != args->blkno) {
+ error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
+ -1, &bp2, XFS_ATTR_FORK);
+ if (error) {
+ return(error);
+ }
+ ASSERT(bp2 != NULL);
+ } else {
+ bp2 = bp1;
+ }
+
+ leaf1 = bp1->data;
+ ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
+ ASSERT(args->index >= 0);
+ entry1 = &leaf1->entries[ args->index ];
+
+ leaf2 = bp2->data;
+ ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
+ ASSERT(args->index2 >= 0);
+ entry2 = &leaf2->entries[ args->index2 ];
+
+#ifdef DEBUG
+ if (entry1->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
+ namelen1 = name_loc->namelen;
+ name1 = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+ namelen1 = name_rmt->namelen;
+ name1 = (char *)name_rmt->name;
+ }
+ if (entry2->flags & XFS_ATTR_LOCAL) {
+ name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
+ namelen2 = name_loc->namelen;
+ name2 = (char *)name_loc->nameval;
+ } else {
+ name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+ namelen2 = name_rmt->namelen;
+ name2 = (char *)name_rmt->name;
+ }
+ ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+ ASSERT(namelen1 == namelen2);
+ ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+ ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+ ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+ entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+ xfs_da_log_buf(args->trans, bp1,
+ XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+ if (args->rmtblkno) {
+ ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+ name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+ name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+ name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ xfs_da_log_buf(args->trans, bp1,
+ XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+ }
+
+ entry2->flags |= XFS_ATTR_INCOMPLETE;
+ xfs_da_log_buf(args->trans, bp2,
+ XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+ if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+ name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+ name_rmt->valueblk = 0;
+ name_rmt->valuelen = 0;
+ xfs_da_log_buf(args->trans, bp2,
+ XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+ }
+ xfs_da_buf_done(bp1);
+ if (bp1 != bp2)
+ xfs_da_buf_done(bp2);
+
+ /*
+ * Commit the flag value change and start the next trans in series.
+ */
+ error = xfs_trans_roll(&args->trans, args->dp);
+
+ return(error);
+}
+
+/*========================================================================
+ * Indiscriminately delete the entire attribute fork
+ *========================================================================*/
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
+{
+ xfs_da_blkinfo_t *info;
+ xfs_daddr_t blkno;
+ xfs_dabuf_t *bp;
+ int error;
+
+ /*
+ * Read block 0 to see what we have to work with.
+ * We only get here if we have extents, since we remove
+ * the extents in reverse order the extent containing
+ * block 0 must still be there.
+ */
+ error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ blkno = xfs_da_blkno(bp);
+
+ /*
+ * Invalidate the tree, even if the "tree" is only a single leaf block.
+ * This is a depth-first traversal!
+ */
+ info = bp->data;
+ if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) {
+ error = xfs_attr_node_inactive(trans, dp, bp, 1);
+ } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) {
+ error = xfs_attr_leaf_inactive(trans, dp, bp);
+ } else {
+ error = XFS_ERROR(EIO);
+ xfs_da_brelse(*trans, bp);
+ }
+ if (error)
+ return(error);
+
+ /*
+ * Invalidate the incore copy of the root block.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ xfs_da_binval(*trans, bp); /* remove from cache */
+ /*
+ * Commit the invalidate and start the next transaction.
+ */
+ error = xfs_trans_roll(trans, dp);
+
+ return (error);
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+STATIC int
+xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
+ int level)
+{
+ xfs_da_blkinfo_t *info;
+ xfs_da_intnode_t *node;
+ xfs_dablk_t child_fsb;
+ xfs_daddr_t parent_blkno, child_blkno;
+ int error, count, i;
+ xfs_dabuf_t *child_bp;
+
+ /*
+ * Since this code is recursive (gasp!) we must protect ourselves.
+ */
+ if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_da_brelse(*trans, bp); /* no locks for later trans */
+ return(XFS_ERROR(EIO));
+ }
+
+ node = bp->data;
+ ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ parent_blkno = xfs_da_blkno(bp); /* save for re-read later */
+ count = be16_to_cpu(node->hdr.count);
+ if (!count) {
+ xfs_da_brelse(*trans, bp);
+ return(0);
+ }
+ child_fsb = be32_to_cpu(node->btree[0].before);
+ xfs_da_brelse(*trans, bp); /* no locks for later trans */
+
+ /*
+ * If this is the node level just above the leaves, simply loop
+ * over the leaves removing all of them. If this is higher up
+ * in the tree, recurse downward.
+ */
+ for (i = 0; i < count; i++) {
+ /*
+ * Read the subsidiary block to see what we have to work with.
+ * Don't do this in a transaction. This is a depth-first
+ * traversal of the tree so we may deal with many blocks
+ * before we come back to this one.
+ */
+ error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+ XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ if (child_bp) {
+ /* save for re-read later */
+ child_blkno = xfs_da_blkno(child_bp);
+
+ /*
+ * Invalidate the subtree, however we have to.
+ */
+ info = child_bp->data;
+ if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) {
+ error = xfs_attr_node_inactive(trans, dp,
+ child_bp, level+1);
+ } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) {
+ error = xfs_attr_leaf_inactive(trans, dp,
+ child_bp);
+ } else {
+ error = XFS_ERROR(EIO);
+ xfs_da_brelse(*trans, child_bp);
+ }
+ if (error)
+ return(error);
+
+ /*
+ * Remove the subsidiary block from the cache
+ * and from the log.
+ */
+ error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+ &child_bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ xfs_da_binval(*trans, child_bp);
+ }
+
+ /*
+ * If we're not done, re-read the parent to get the next
+ * child block number.
+ */
+ if ((i+1) < count) {
+ error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
+ &bp, XFS_ATTR_FORK);
+ if (error)
+ return(error);
+ child_fsb = be32_to_cpu(node->btree[i+1].before);
+ xfs_da_brelse(*trans, bp);
+ }
+ /*
+ * Atomically commit the whole invalidate stuff.
+ */
+ error = xfs_trans_roll(trans, dp);
+ if (error)
+ return (error);
+ }
+
+ return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+STATIC int
+xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
+{
+ xfs_attr_leafblock_t *leaf;
+ xfs_attr_leaf_entry_t *entry;
+ xfs_attr_leaf_name_remote_t *name_rmt;
+ xfs_attr_inactive_list_t *list, *lp;
+ int error, count, size, tmp, i;
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
+
+ /*
+ * Count the number of "remote" value extents.
+ */
+ count = 0;
+ entry = &leaf->entries[0];
+ for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ if (be16_to_cpu(entry->nameidx) &&
+ ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, i);
+ if (name_rmt->valueblk)
+ count++;
+ }
+ }
+
+ /*
+ * If there are no "remote" values, we're done.
+ */
+ if (count == 0) {
+ xfs_da_brelse(*trans, bp);
+ return(0);
+ }
+
+ /*
+ * Allocate storage for a list of all the "remote" value extents.
+ */
+ size = count * sizeof(xfs_attr_inactive_list_t);
+ list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Identify each of the "remote" value extents.
+ */
+ lp = list;
+ entry = &leaf->entries[0];
+ for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+ if (be16_to_cpu(entry->nameidx) &&
+ ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+ name_rmt = xfs_attr_leaf_name_remote(leaf, i);
+ if (name_rmt->valueblk) {
+ lp->valueblk = be32_to_cpu(name_rmt->valueblk);
+ lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ lp++;
+ }
+ }
+ }
+ xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */
+
+ /*
+ * Invalidate each of the "remote" value extents.
+ */
+ error = 0;
+ for (lp = list, i = 0; i < count; i++, lp++) {
+ tmp = xfs_attr_leaf_freextent(trans, dp,
+ lp->valueblk, lp->valuelen);
+
+ if (error == 0)
+ error = tmp; /* save only the 1st errno */
+ }
+
+ kmem_free((xfs_caddr_t)list);
+ return(error);
+}
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ */
+STATIC int
+xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
+ xfs_dablk_t blkno, int blkcnt)
+{
+ xfs_bmbt_irec_t map;
+ xfs_dablk_t tblkno;
+ int tblkcnt, dblkcnt, nmap, error;
+ xfs_daddr_t dblkno;
+ xfs_buf_t *bp;
+
+ /*
+ * Roll through the "value", invalidating the attribute value's
+ * blocks.
+ */
+ tblkno = blkno;
+ tblkcnt = blkcnt;
+ while (tblkcnt > 0) {
+ /*
+ * Try to remember where we decided to put the value.
+ */
+ nmap = 1;
+ error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
+ XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+ NULL, 0, &map, &nmap, NULL);
+ if (error) {
+ return(error);
+ }
+ ASSERT(nmap == 1);
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+ /*
+ * If it's a hole, these are already unmapped
+ * so there's nothing to invalidate.
+ */
+ if (map.br_startblock != HOLESTARTBLOCK) {
+
+ dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+ map.br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+ map.br_blockcount);
+ bp = xfs_trans_get_buf(*trans,
+ dp->i_mount->m_ddev_targp,
+ dblkno, dblkcnt, XBF_LOCK);
+ xfs_trans_binval(*trans, bp);
+ /*
+ * Roll to next transaction.
+ */
+ error = xfs_trans_roll(trans, dp);
+ if (error)
+ return (error);
+ }
+
+ tblkno += map.br_blockcount;
+ tblkcnt -= map.br_blockcount;
+ }
+
+ return(0);
+}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
new file mode 100644
index 00000000..9c7d22fd
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+
+/*
+ * Attribute storage layout, internal structure, access macros, etc.
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys. The
+ * internal links in the Btree are logical block offsets into the file.
+ */
+
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_attr_list_context;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*========================================================================
+ * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This is the structure of the leaf nodes in the Btree.
+ *
+ * Struct leaf_entry's are packed from the top. Name/values grow from the
+ * bottom but are not packed. The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped. When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again. If we
+ * still don't have enough space, then we have to split the block. The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string. The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard. If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry. The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry. It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set. We clear the
+ * bit when we have finished setting up the attribute. We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+
+typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
+ __be16 base; /* base of free region */
+ __be16 size; /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
+ xfs_da_blkinfo_t info; /* block type, links, etc. */
+ __be16 count; /* count of active leaf_entry's */
+ __be16 usedbytes; /* num bytes of names/values stored */
+ __be16 firstused; /* first used byte in name area */
+ __u8 holes; /* != 0 if blk needs compaction */
+ __u8 pad1;
+ xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+ /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
+ __be32 hashval; /* hash value of name */
+ __be16 nameidx; /* index into buffer of name/value */
+ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+ __u8 pad2; /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+ __be16 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 nameval[1]; /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+ __be32 valueblk; /* block number of value bytes */
+ __be32 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 name[1]; /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+ xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
+ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
+ xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
+#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+ ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+ ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_remote_t *)
+ &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_local_t *)
+ &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+}
+
+static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+ return (((bsize) >> 1) + ((bsize) >> 2));
+}
+
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ */
+typedef struct xfs_attr_inactive_list {
+ xfs_dablk_t valueblk; /* block number of value bytes */
+ int valuelen; /* number of bytes in value */
+} xfs_attr_inactive_list_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute fork size < XFS_LITINO(mp).
+ */
+void xfs_attr_shortform_create(struct xfs_da_args *args);
+void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
+int xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_attr_shortform_remove(struct xfs_da_args *args);
+int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
+int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+
+
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */
+int xfs_attr_leaf_to_node(struct xfs_da_args *args);
+int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
+ struct xfs_da_args *args, int forkoff);
+int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
+int xfs_attr_leaf_setflag(struct xfs_da_args *args);
+int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int xfs_attr_leaf_split(struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk);
+int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
+ struct xfs_da_args *args);
+int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
+int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
+ struct xfs_da_args *args);
+int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
+ struct xfs_da_args *args);
+int xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
+ struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk);
+int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+
+/*
+ * Utility routines.
+ */
+xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
+int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
+ struct xfs_dabuf *leaf2_bp);
+int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
+ int *local);
+#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
new file mode 100644
index 00000000..919756e3
--- /dev/null
+++ b/fs/xfs/xfs_attr_sf.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define __XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+
+/*
+ * Entries are packed toward the top as tight as possible.
+ */
+typedef struct xfs_attr_shortform {
+ struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ } hdr;
+ struct xfs_attr_sf_entry {
+ __uint8_t namelen; /* actual length of name (no NULL) */
+ __uint8_t valuelen; /* actual length of value (no NULL) */
+ __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ __uint8_t nameval[1]; /* name & value bytes concatenated */
+ } list[1]; /* variable sized array */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+
+/*
+ * We generate this then sort it, attr_list() must return things in hash-order.
+ */
+typedef struct xfs_attr_sf_sort {
+ __uint8_t entno; /* entry number in original list */
+ __uint8_t namelen; /* length of name value (no null) */
+ __uint8_t valuelen; /* length of value */
+ __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ xfs_dahash_t hash; /* this entry's hash value */
+ unsigned char *name; /* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
+ (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
+#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
+ ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
+ ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
+ ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
+ (be16_to_cpu(((xfs_attr_shortform_t *) \
+ ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
new file mode 100644
index 00000000..48228848
--- /dev/null
+++ b/fs/xfs/xfs_bit.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+/*
+ * Return whether bitmap is empty.
+ * Size is number of words in the bitmap, which is padded to word boundary
+ * Returns 1 for empty, 0 for non-empty.
+ */
+int
+xfs_bitmap_empty(uint *map, uint size)
+{
+ uint i;
+ uint ret = 0;
+
+ for (i = 0; i < size; i++) {
+ ret |= map[i];
+ }
+
+ return (ret == 0);
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit. Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint size, uint start_bit)
+{
+ uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+ uint result = 0;
+ uint tmp;
+
+ size <<= BIT_TO_WORD_SHIFT;
+
+ ASSERT(start_bit < size);
+ size -= start_bit & ~(NBWORD - 1);
+ start_bit &= (NBWORD - 1);
+ if (start_bit) {
+ tmp = *p++;
+ /* set to one first offset bits prior to start */
+ tmp |= (~0U >> (NBWORD-start_bit));
+ if (tmp != ~0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ while (size) {
+ if ((tmp = *p++) != ~0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ return result - start_bit;
+found:
+ return result + ffz(tmp) - start_bit;
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+ uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+ uint result = start_bit & ~(NBWORD - 1);
+ uint tmp;
+
+ size <<= BIT_TO_WORD_SHIFT;
+
+ if (start_bit >= size)
+ return -1;
+ size -= result;
+ start_bit &= (NBWORD - 1);
+ if (start_bit) {
+ tmp = *p++;
+ /* set to zero first offset bits prior to start */
+ tmp &= (~0U << start_bit);
+ if (tmp != 0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ while (size) {
+ if ((tmp = *p++) != 0U)
+ goto found;
+ result += NBWORD;
+ size -= NBWORD;
+ }
+ return -1;
+found:
+ return result + ffs(tmp) - 1;
+}
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
new file mode 100644
index 00000000..f1e3c907
--- /dev/null
+++ b/fs/xfs/xfs_bit.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BIT_H__
+#define __XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * masks with n high/low bits set, 64-bit values
+ */
+static inline __uint64_t xfs_mask64hi(int n)
+{
+ return (__uint64_t)-1 << (64 - (n));
+}
+static inline __uint32_t xfs_mask32lo(int n)
+{
+ return ((__uint32_t)1 << (n)) - 1;
+}
+static inline __uint64_t xfs_mask64lo(int n)
+{
+ return ((__uint64_t)1 << (n)) - 1;
+}
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_highbit32(__uint32_t v)
+{
+ return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+ return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+ return ffs(v) - 1;
+}
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_lowbit64(__uint64_t v)
+{
+ __uint32_t w = (__uint32_t)v;
+ int n = 0;
+
+ if (w) { /* lower bits */
+ n = ffs(w);
+ } else { /* upper bits */
+ w = (__uint32_t)(v >> 32);
+ if (w && (n = ffs(w)))
+ n += 32;
+ }
+ return n - 1;
+}
+
+/* Return whether bitmap is empty (1 == empty) */
+extern int xfs_bitmap_empty(uint *map, uint size);
+
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif /* __XFS_BIT_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
new file mode 100644
index 00000000..a175933a
--- /dev/null
+++ b/fs/xfs/xfs_bmap.c
@@ -0,0 +1,6139 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_mount.h"
+#include "xfs_itable.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_inode_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_rw.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+
+#ifdef DEBUG
+STATIC void
+xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
+#endif
+
+kmem_zone_t *xfs_bmap_free_item_zone;
+
+/*
+ * Prototypes for internal bmap routines.
+ */
+
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated */
+ xfs_bmap_free_t *flist, /* blocks to free at commit */
+ int *flags); /* inode logging flags */
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_local(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated */
+ xfs_bmap_free_t *flist, /* blocks to free at commit */
+ int *flags); /* inode logging flags */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a delayed
+ * allocation to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_delay_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
+ xfs_fsblock_t *first, /* pointer to firstblock variable */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
+ int *logflagsp); /* inode logging flags */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a delayed allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_hole_delay(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp); /* inode logging flags */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_hole_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t *cur, /* if null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp, /* inode logging flags */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting an unwritten
+ * allocation to a real allocation or vice versa.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_unwritten_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp); /* inode logging flags */
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int /* error */
+xfs_bmap_alloc(
+ xfs_bmalloca_t *ap); /* bmap alloc argument struct */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int /* error */
+xfs_bmap_btree_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int *logflagsp, /* inode logging flags */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Remove the entry "free" from the free item list. Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+STATIC void
+xfs_bmap_del_free(
+ xfs_bmap_free_t *flist, /* free item list header */
+ xfs_bmap_free_item_t *prev, /* previous item on list, if any */
+ xfs_bmap_free_item_t *free); /* list item to be freed */
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int /* error */
+xfs_bmap_extents_to_btree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first-block-allocated */
+ xfs_bmap_free_t *flist, /* blocks freed in xaction */
+ xfs_btree_cur_t **curp, /* cursor returned to caller */
+ int wasdel, /* converting a delayed alloc */
+ int *logflagsp, /* inode logging flags */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Convert a local file to an extents file.
+ * This code is sort of bogus, since the file data needs to get
+ * logged so it won't be lost. The bmap-level manipulations are ok, though.
+ */
+STATIC int /* error */
+xfs_bmap_local_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated in xaction */
+ xfs_extlen_t total, /* total blocks needed by transaction */
+ int *logflagsp, /* inode logging flags */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry. If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_extents(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int whichfork, /* data or attr fork */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ */
+STATIC int /* error */
+xfs_bmap_isaeof(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t off, /* file offset in fsblocks */
+ int whichfork, /* data or attribute fork */
+ char *aeof); /* return value */
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_filblks_t len); /* delayed extent length */
+
+#ifdef DEBUG
+/*
+ * Perform various validation checks on the values being returned
+ * from xfs_bmapi().
+ */
+STATIC void
+xfs_bmap_validate_ret(
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ int flags,
+ xfs_bmbt_irec_t *mval,
+ int nmap,
+ int ret_nmap);
+#else
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+STATIC int
+xfs_bmap_count_tree(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_ifork_t *ifp,
+ xfs_fsblock_t blockno,
+ int levelin,
+ int *count);
+
+STATIC void
+xfs_bmap_count_leaves(
+ xfs_ifork_t *ifp,
+ xfs_extnum_t idx,
+ int numrecs,
+ int *count);
+
+STATIC void
+xfs_bmap_disk_count_leaves(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ int numrecs,
+ int *count);
+
+/*
+ * Bmap internal routines.
+ */
+
+STATIC int /* error */
+xfs_bmbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = off;
+ cur->bc_rec.b.br_startblock = bno;
+ cur->bc_rec.b.br_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_ge(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = off;
+ cur->bc_rec.b.br_startblock = bno;
+ cur->bc_rec.b.br_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+* Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ xfs_exntst_t state)
+{
+ union xfs_btree_rec rec;
+
+ xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_btree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated */
+ xfs_bmap_free_t *flist, /* blocks to free at commit */
+ int *flags) /* inode logging flags */
+{
+ xfs_btree_cur_t *cur; /* btree cursor */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* file system mount struct */
+ int stat; /* newroot status */
+
+ mp = ip->i_mount;
+ if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
+ *flags |= XFS_ILOG_DBROOT;
+ else {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.firstblock = *firstblock;
+ if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+ goto error0;
+ /* must be at least one entry */
+ XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+ if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
+ goto error0;
+ if (stat == 0) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return XFS_ERROR(ENOSPC);
+ }
+ *firstblock = cur->bc_private.b.firstblock;
+ cur->bc_private.b.allocated = 0;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ }
+ return 0;
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated */
+ xfs_bmap_free_t *flist, /* blocks to free at commit */
+ int *flags) /* inode logging flags */
+{
+ xfs_btree_cur_t *cur; /* bmap btree cursor */
+ int error; /* error return value */
+
+ if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+ return 0;
+ cur = NULL;
+ error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+ flags, XFS_DATA_FORK);
+ if (cur) {
+ cur->bc_private.b.allocated = 0;
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files.
+ */
+STATIC int /* error */
+xfs_bmap_add_attrfork_local(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated */
+ xfs_bmap_free_t *flist, /* blocks to free at commit */
+ int *flags) /* inode logging flags */
+{
+ xfs_da_args_t dargs; /* args for dir/attr code */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* mount structure pointer */
+
+ if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+ return 0;
+ if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+ mp = ip->i_mount;
+ memset(&dargs, 0, sizeof(dargs));
+ dargs.dp = ip;
+ dargs.firstblock = firstblock;
+ dargs.flist = flist;
+ dargs.total = mp->m_dirblkfsbs;
+ dargs.whichfork = XFS_DATA_FORK;
+ dargs.trans = tp;
+ error = xfs_dir2_sf_to_block(&dargs);
+ } else
+ error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
+ XFS_DATA_FORK);
+ return error;
+}
+
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after allocating space (or doing a delayed allocation).
+ */
+STATIC int /* error */
+xfs_bmap_add_extent(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ xfs_fsblock_t *first, /* pointer to firstblock variable */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ xfs_btree_cur_t *cur; /* btree cursor or null */
+ xfs_filblks_t da_new; /* new count del alloc blocks used */
+ xfs_filblks_t da_old; /* old count del alloc blocks used */
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork ptr */
+ int logflags; /* returned value */
+ xfs_extnum_t nextents; /* number of extents in file now */
+
+ XFS_STATS_INC(xs_add_exlist);
+
+ cur = *curp;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ da_old = da_new = 0;
+ error = 0;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= nextents);
+
+ /*
+ * This is the first extent added to a new/empty file.
+ * Special case this one, so other routines get to assume there are
+ * already extents in the list.
+ */
+ if (nextents == 0) {
+ xfs_iext_insert(ip, *idx, 1, new,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+
+ ASSERT(cur == NULL);
+
+ if (!isnullstartblock(new->br_startblock)) {
+ XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else
+ logflags = 0;
+ }
+ /*
+ * Any kind of new delayed allocation goes here.
+ */
+ else if (isnullstartblock(new->br_startblock)) {
+ if (cur)
+ ASSERT((cur->bc_private.b.flags &
+ XFS_BTCUR_BPRV_WASDEL) == 0);
+ error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
+ &logflags);
+ }
+ /*
+ * Real allocation off the end of the file.
+ */
+ else if (*idx == nextents) {
+ if (cur)
+ ASSERT((cur->bc_private.b.flags &
+ XFS_BTCUR_BPRV_WASDEL) == 0);
+ error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
+ &logflags, whichfork);
+ } else {
+ xfs_bmbt_irec_t prev; /* old extent at offset idx */
+
+ /*
+ * Get the record referred to by idx.
+ */
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
+ /*
+ * If it's a real allocation record, and the new allocation ends
+ * after the start of the referred to record, then we're filling
+ * in a delayed or unwritten allocation with a real one, or
+ * converting real back to unwritten.
+ */
+ if (!isnullstartblock(new->br_startblock) &&
+ new->br_startoff + new->br_blockcount > prev.br_startoff) {
+ if (prev.br_state != XFS_EXT_UNWRITTEN &&
+ isnullstartblock(prev.br_startblock)) {
+ da_old = startblockval(prev.br_startblock);
+ if (cur)
+ ASSERT(cur->bc_private.b.flags &
+ XFS_BTCUR_BPRV_WASDEL);
+ error = xfs_bmap_add_extent_delay_real(ip,
+ idx, &cur, new, &da_new,
+ first, flist, &logflags);
+ } else {
+ ASSERT(new->br_state == XFS_EXT_NORM ||
+ new->br_state == XFS_EXT_UNWRITTEN);
+
+ error = xfs_bmap_add_extent_unwritten_real(ip,
+ idx, &cur, new, &logflags);
+ if (error)
+ goto done;
+ }
+ }
+ /*
+ * Otherwise we're filling in a hole with an allocation.
+ */
+ else {
+ if (cur)
+ ASSERT((cur->bc_private.b.flags &
+ XFS_BTCUR_BPRV_WASDEL) == 0);
+ error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
+ new, &logflags, whichfork);
+ }
+ }
+
+ if (error)
+ goto done;
+ ASSERT(*curp == cur || *curp == NULL);
+
+ /*
+ * Convert to a btree if necessary.
+ */
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first,
+ flist, &cur, da_old > 0, &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto done;
+ }
+ /*
+ * Adjust for changes in reserved delayed indirect blocks.
+ * Nothing to do for disk quotas here.
+ */
+ if (da_old || da_new) {
+ xfs_filblks_t nblks;
+
+ nblks = da_new;
+ if (cur)
+ nblks += cur->bc_private.b.allocated;
+ ASSERT(nblks <= da_old);
+ if (nblks < da_old)
+ xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+ (int64_t)(da_old - nblks), 0);
+ }
+ /*
+ * Clear out the allocated field, done with it now in any case.
+ */
+ if (cur) {
+ cur->bc_private.b.allocated = 0;
+ *curp = cur;
+ }
+done:
+#ifdef DEBUG
+ if (!error)
+ xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
+#endif
+ *logflagsp = logflags;
+ return error;
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a delayed
+ * allocation to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_delay_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
+ xfs_fsblock_t *first, /* pointer to firstblock variable */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
+ int *logflagsp) /* inode logging flags */
+{
+ xfs_btree_cur_t *cur; /* btree cursor */
+ int diff; /* temp value */
+ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
+ int error; /* error return value */
+ int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_fileoff_t new_endoff; /* end offset of new entry */
+ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ int rval=0; /* return value (logging flags) */
+ int state = 0;/* state bits, accessed thru macros */
+ xfs_filblks_t temp=0; /* value for dnew calculations */
+ xfs_filblks_t temp2=0;/* value for dnew calculations */
+ int tmp_rval; /* partial logging flags */
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+
+ /*
+ * Set up a bunch of variables to make the tests simpler.
+ */
+ cur = *curp;
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ep = xfs_iext_get_ext(ifp, *idx);
+ xfs_bmbt_get_all(ep, &PREV);
+ new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(PREV.br_startoff <= new->br_startoff);
+ ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+ /*
+ * Set flags determining what part of the previous delayed allocation
+ * extent is being replaced by a real allocation.
+ */
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ */
+ if (*idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == new->br_state &&
+ LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ /*
+ * Check and set flags if this segment has a right neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ * Also check for all-three-contiguous being too large.
+ */
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ new->br_state == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= MAXEXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ error = 0;
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The left and right neighbors are both contiguous with new.
+ */
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ LEFT.br_blockcount + PREV.br_blockcount +
+ RIGHT.br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 2, state);
+ ip->i_d.di_nextents--;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ RIGHT.br_startblock,
+ RIGHT.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ LEFT.br_startblock,
+ LEFT.br_blockcount +
+ PREV.br_blockcount +
+ RIGHT.br_blockcount, LEFT.br_state)))
+ goto done;
+ }
+ *dnew = 0;
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The left neighbor is contiguous, the right is not.
+ */
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ LEFT.br_blockcount + PREV.br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+ LEFT.br_startblock, LEFT.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ LEFT.br_startblock,
+ LEFT.br_blockcount +
+ PREV.br_blockcount, LEFT.br_state)))
+ goto done;
+ }
+ *dnew = 0;
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * The right neighbor is contiguous, the left is not.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep, new->br_startblock);
+ xfs_bmbt_set_blockcount(ep,
+ PREV.br_blockcount + RIGHT.br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ RIGHT.br_startblock,
+ RIGHT.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+ new->br_startblock,
+ PREV.br_blockcount +
+ RIGHT.br_blockcount, PREV.br_state)))
+ goto done;
+ }
+
+ *dnew = 0;
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Filling in all of a previously delayed allocation extent.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep, new->br_startblock);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ip->i_d.di_nextents++;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+
+ *dnew = 0;
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Filling in the first part of a previous delayed allocation.
+ * The left neighbor is contiguous.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+ LEFT.br_blockcount + new->br_blockcount);
+ xfs_bmbt_set_startoff(ep,
+ PREV.br_startoff + new->br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+
+ temp = PREV.br_blockcount - new->br_blockcount;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+ LEFT.br_startblock, LEFT.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ LEFT.br_startblock,
+ LEFT.br_blockcount +
+ new->br_blockcount,
+ LEFT.br_state)))
+ goto done;
+ }
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ startblockval(PREV.br_startblock));
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ --*idx;
+ *dnew = temp;
+ break;
+
+ case BMAP_LEFT_FILLING:
+ /*
+ * Filling in the first part of a previous delayed allocation.
+ * The left neighbor is not contiguous.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startoff(ep, new_endoff);
+ temp = PREV.br_blockcount - new->br_blockcount;
+ xfs_bmbt_set_blockcount(ep, temp);
+ xfs_iext_insert(ip, *idx, 1, new, state);
+ ip->i_d.di_nextents++;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+ error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+ first, flist, &cur, 1, &tmp_rval,
+ XFS_DATA_FORK);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ startblockval(PREV.br_startblock) -
+ (cur ? cur->bc_private.b.allocated : 0));
+ ep = xfs_iext_get_ext(ifp, *idx + 1);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
+
+ *dnew = temp;
+ break;
+
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Filling in the last part of a previous delayed allocation.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ temp = PREV.br_blockcount - new->br_blockcount;
+ trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
+ new->br_startoff, new->br_startblock,
+ new->br_blockcount + RIGHT.br_blockcount,
+ RIGHT.br_state);
+ trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ RIGHT.br_startblock,
+ RIGHT.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount +
+ RIGHT.br_blockcount,
+ RIGHT.br_state)))
+ goto done;
+ }
+
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ startblockval(PREV.br_startblock));
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+ *dnew = temp;
+ break;
+
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Filling in the last part of a previous delayed allocation.
+ * The right neighbor is not contiguous.
+ */
+ temp = PREV.br_blockcount - new->br_blockcount;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ xfs_iext_insert(ip, *idx + 1, 1, new, state);
+ ip->i_d.di_nextents++;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+ error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+ first, flist, &cur, 1, &tmp_rval,
+ XFS_DATA_FORK);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ startblockval(PREV.br_startblock) -
+ (cur ? cur->bc_private.b.allocated : 0));
+ ep = xfs_iext_get_ext(ifp, *idx);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+ *dnew = temp;
+ break;
+
+ case 0:
+ /*
+ * Filling in the middle part of a previous delayed allocation.
+ * Contiguity is impossible here.
+ * This case is avoided almost all the time.
+ *
+ * We start with a delayed allocation:
+ *
+ * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+ * PREV @ idx
+ *
+ * and we are allocating:
+ * +rrrrrrrrrrrrrrrrr+
+ * new
+ *
+ * and we set it up for insertion as:
+ * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+ * new
+ * PREV @ idx LEFT RIGHT
+ * inserted at idx + 1
+ */
+ temp = new->br_startoff - PREV.br_startoff;
+ temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
+ LEFT = *new;
+ RIGHT.br_state = PREV.br_state;
+ RIGHT.br_startblock = nullstartblock(
+ (int)xfs_bmap_worst_indlen(ip, temp2));
+ RIGHT.br_startoff = new_endoff;
+ RIGHT.br_blockcount = temp2;
+ /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+ xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
+ ip->i_d.di_nextents++;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+ error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+ first, flist, &cur, 1, &tmp_rval,
+ XFS_DATA_FORK);
+ rval |= tmp_rval;
+ if (error)
+ goto done;
+ }
+ temp = xfs_bmap_worst_indlen(ip, temp);
+ temp2 = xfs_bmap_worst_indlen(ip, temp2);
+ diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
+ (cur ? cur->bc_private.b.allocated : 0));
+ if (diff > 0 &&
+ xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+ -((int64_t)diff), 0)) {
+ /*
+ * Ick gross gag me with a spoon.
+ */
+ ASSERT(0); /* want to see if this ever happens! */
+ while (diff > 0) {
+ if (temp) {
+ temp--;
+ diff--;
+ if (!diff ||
+ !xfs_icsb_modify_counters(ip->i_mount,
+ XFS_SBS_FDBLOCKS,
+ -((int64_t)diff), 0))
+ break;
+ }
+ if (temp2) {
+ temp2--;
+ diff--;
+ if (!diff ||
+ !xfs_icsb_modify_counters(ip->i_mount,
+ XFS_SBS_FDBLOCKS,
+ -((int64_t)diff), 0))
+ break;
+ }
+ }
+ }
+ ep = xfs_iext_get_ext(ifp, *idx);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
+ nullstartblock((int)temp2));
+ trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+
+ ++*idx;
+ *dnew = temp + temp2;
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+ *curp = cur;
+done:
+ *logflagsp = rval;
+ return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting an unwritten
+ * allocation to a real allocation or vice versa.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_unwritten_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp) /* inode logging flags */
+{
+ xfs_btree_cur_t *cur; /* btree cursor */
+ xfs_bmbt_rec_host_t *ep; /* extent entry for idx */
+ int error; /* error return value */
+ int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_fileoff_t new_endoff; /* end offset of new entry */
+ xfs_exntst_t newext; /* new extent state */
+ xfs_exntst_t oldext; /* old extent state */
+ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
+ /* left is 0, right is 1, prev is 2 */
+ int rval=0; /* return value (logging flags) */
+ int state = 0;/* state bits, accessed thru macros */
+
+#define LEFT r[0]
+#define RIGHT r[1]
+#define PREV r[2]
+ /*
+ * Set up a bunch of variables to make the tests simpler.
+ */
+ error = 0;
+ cur = *curp;
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ ep = xfs_iext_get_ext(ifp, *idx);
+ xfs_bmbt_get_all(ep, &PREV);
+ newext = new->br_state;
+ oldext = (newext == XFS_EXT_UNWRITTEN) ?
+ XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+ ASSERT(PREV.br_state == oldext);
+ new_endoff = new->br_startoff + new->br_blockcount;
+ ASSERT(PREV.br_startoff <= new->br_startoff);
+ ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+
+ /*
+ * Set flags determining what part of the previous oldext allocation
+ * extent is being replaced by a newext allocation.
+ */
+ if (PREV.br_startoff == new->br_startoff)
+ state |= BMAP_LEFT_FILLING;
+ if (PREV.br_startoff + PREV.br_blockcount == new_endoff)
+ state |= BMAP_RIGHT_FILLING;
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ */
+ if (*idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
+
+ if (isnullstartblock(LEFT.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+ LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+ LEFT.br_state == newext &&
+ LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ /*
+ * Check and set flags if this segment has a right neighbor.
+ * Don't set contiguous if the combined extent would be too large.
+ * Also check for all-three-contiguous being too large.
+ */
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+ if (isnullstartblock(RIGHT.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new_endoff == RIGHT.br_startoff &&
+ new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
+ newext == RIGHT.br_state &&
+ new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING)) !=
+ (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
+ BMAP_RIGHT_FILLING) ||
+ LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+ <= MAXEXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the FILLING and CONTIG state bits.
+ */
+ switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG |
+ BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left and right neighbors are both contiguous with new.
+ */
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ LEFT.br_blockcount + PREV.br_blockcount +
+ RIGHT.br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 2, state);
+ ip->i_d.di_nextents -= 2;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ RIGHT.br_startblock,
+ RIGHT.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ LEFT.br_startblock,
+ LEFT.br_blockcount + PREV.br_blockcount +
+ RIGHT.br_blockcount, LEFT.br_state)))
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The left neighbor is contiguous, the right is not.
+ */
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ LEFT.br_blockcount + PREV.br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+ ip->i_d.di_nextents--;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+ PREV.br_startblock, PREV.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+ LEFT.br_startblock,
+ LEFT.br_blockcount + PREV.br_blockcount,
+ LEFT.br_state)))
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * The right neighbor is contiguous, the left is not.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep,
+ PREV.br_blockcount + RIGHT.br_blockcount);
+ xfs_bmbt_set_state(ep, newext);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+ ip->i_d.di_nextents--;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+ RIGHT.br_startblock,
+ RIGHT.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount + RIGHT.br_blockcount,
+ newext)))
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
+ /*
+ * Setting all of a previous oldext extent to newext.
+ * Neither the left nor right neighbors are contiguous with
+ * the new one.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_state(ep, newext);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ newext)))
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is contiguous.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
+ LEFT.br_blockcount + new->br_blockcount);
+ xfs_bmbt_set_startoff(ep,
+ PREV.br_startoff + new->br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep,
+ new->br_startblock + new->br_blockcount);
+ xfs_bmbt_set_blockcount(ep,
+ PREV.br_blockcount - new->br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ --*idx;
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+ PREV.br_startblock, PREV.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur,
+ PREV.br_startoff + new->br_blockcount,
+ PREV.br_startblock + new->br_blockcount,
+ PREV.br_blockcount - new->br_blockcount,
+ oldext)))
+ goto done;
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ if (xfs_bmbt_update(cur, LEFT.br_startoff,
+ LEFT.br_startblock,
+ LEFT.br_blockcount + new->br_blockcount,
+ LEFT.br_state))
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_FILLING:
+ /*
+ * Setting the first part of a previous oldext extent to newext.
+ * The left neighbor is not contiguous.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
+ xfs_bmbt_set_startoff(ep, new_endoff);
+ xfs_bmbt_set_blockcount(ep,
+ PREV.br_blockcount - new->br_blockcount);
+ xfs_bmbt_set_startblock(ep,
+ new->br_startblock + new->br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_insert(ip, *idx, 1, new, state);
+ ip->i_d.di_nextents++;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+ PREV.br_startblock, PREV.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur,
+ PREV.br_startoff + new->br_blockcount,
+ PREV.br_startblock + new->br_blockcount,
+ PREV.br_blockcount - new->br_blockcount,
+ oldext)))
+ goto done;
+ cur->bc_rec.b = *new;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ break;
+
+ case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is contiguous with the new allocation.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep,
+ PREV.br_blockcount - new->br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff, new->br_startblock,
+ new->br_blockcount + RIGHT.br_blockcount, newext);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ if (cur == NULL)
+ rval = XFS_ILOG_DEXT;
+ else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+ PREV.br_startblock,
+ PREV.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+ PREV.br_startblock,
+ PREV.br_blockcount - new->br_blockcount,
+ oldext)))
+ goto done;
+ if ((error = xfs_btree_increment(cur, 0, &i)))
+ goto done;
+ if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount + RIGHT.br_blockcount,
+ newext)))
+ goto done;
+ }
+ break;
+
+ case BMAP_RIGHT_FILLING:
+ /*
+ * Setting the last part of a previous oldext extent to newext.
+ * The right neighbor is not contiguous.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep,
+ PREV.br_blockcount - new->br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
+
+ ip->i_d.di_nextents++;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+ PREV.br_startblock, PREV.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+ PREV.br_startblock,
+ PREV.br_blockcount - new->br_blockcount,
+ oldext)))
+ goto done;
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ cur->bc_rec.b.br_state = XFS_EXT_NORM;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ break;
+
+ case 0:
+ /*
+ * Setting the middle part of a previous oldext extent to
+ * newext. Contiguity is impossible here.
+ * One extent becomes three extents.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep,
+ new->br_startoff - PREV.br_startoff);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ r[0] = *new;
+ r[1].br_startoff = new_endoff;
+ r[1].br_blockcount =
+ PREV.br_startoff + PREV.br_blockcount - new_endoff;
+ r[1].br_startblock = new->br_startblock + new->br_blockcount;
+ r[1].br_state = oldext;
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
+ ip->i_d.di_nextents += 2;
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+ PREV.br_startblock, PREV.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ /* new right extent - oldext */
+ if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
+ r[1].br_startblock, r[1].br_blockcount,
+ r[1].br_state)))
+ goto done;
+ /* new left extent - oldext */
+ cur->bc_rec.b = PREV;
+ cur->bc_rec.b.br_blockcount =
+ new->br_startoff - PREV.br_startoff;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ /* new middle extent - newext */
+ cur->bc_rec.b.br_state = new->br_state;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ break;
+
+ case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG:
+ case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ case BMAP_LEFT_CONTIG:
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * These cases are all impossible.
+ */
+ ASSERT(0);
+ }
+ *curp = cur;
+done:
+ *logflagsp = rval;
+ return error;
+#undef LEFT
+#undef RIGHT
+#undef PREV
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a delayed allocation.
+ */
+/*ARGSUSED*/
+STATIC int /* error */
+xfs_bmap_add_extent_hole_delay(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp) /* inode logging flags */
+{
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_filblks_t newlen=0; /* new indirect size */
+ xfs_filblks_t oldlen=0; /* old indirect size */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ int state; /* state bits, accessed thru macros */
+ xfs_filblks_t temp=0; /* temp for indirect calculations */
+
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ state = 0;
+ ASSERT(isnullstartblock(new->br_startblock));
+
+ /*
+ * Check and set flags if this segment has a left neighbor
+ */
+ if (*idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if the current (right) segment exists.
+ * If it doesn't exist, we're converting the hole at end-of-file.
+ */
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * Set contiguity flags on the left and right neighbors.
+ * Don't let extents get too large, even if the pieces are contiguous.
+ */
+ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ (left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= MAXEXTLEN)))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the contiguity flags.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with delayed allocations
+ * on the left and on the right.
+ * Merge all three into a single extent record.
+ */
+ --*idx;
+ temp = left.br_blockcount + new->br_blockcount +
+ right.br_blockcount;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = xfs_bmap_worst_indlen(ip, temp);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+ nullstartblock((int)newlen));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ --*idx;
+ temp = left.br_blockcount + new->br_blockcount;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
+ newlen = xfs_bmap_worst_indlen(ip, temp);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+ nullstartblock((int)newlen));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ temp = new->br_blockcount + right.br_blockcount;
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = xfs_bmap_worst_indlen(ip, temp);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff,
+ nullstartblock((int)newlen), temp, right.br_state);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * delayed allocation.
+ * Insert a new entry.
+ */
+ oldlen = newlen = 0;
+ xfs_iext_insert(ip, *idx, 1, new, state);
+ break;
+ }
+ if (oldlen != newlen) {
+ ASSERT(oldlen > newlen);
+ xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
+ (int64_t)(oldlen - newlen), 0);
+ /*
+ * Nothing to do for disk quota accounting here.
+ */
+ }
+ *logflagsp = 0;
+ return 0;
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a real allocation.
+ */
+STATIC int /* error */
+xfs_bmap_add_extent_hole_real(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t *cur, /* if null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ int error; /* error return value */
+ int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ int rval=0; /* return value (logging flags) */
+ int state; /* state bits, accessed thru macros */
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
+ state = 0;
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
+ /*
+ * Check and set flags if this segment has a left neighbor.
+ */
+ if (*idx > 0) {
+ state |= BMAP_LEFT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if this segment has a current value.
+ * Not true if we're inserting into the "hole" at eof.
+ */
+ if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * We're inserting a real allocation between "left" and "right".
+ * Set the contiguity flags. Don't let extents get too large.
+ */
+ if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_startblock + left.br_blockcount == new->br_startblock &&
+ left.br_state == new->br_state &&
+ left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_startblock + new->br_blockcount == right.br_startblock &&
+ new->br_state == right.br_state &&
+ new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= MAXEXTLEN))
+ state |= BMAP_RIGHT_CONTIG;
+
+ error = 0;
+ /*
+ * Select which case we're in here, and implement it.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with real allocations on the
+ * left and on the right.
+ * Merge all three into a single extent record.
+ */
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ left.br_blockcount + new->br_blockcount +
+ right.br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ if (cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur,
+ right.br_startoff,
+ right.br_startblock,
+ right.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, left.br_startoff,
+ left.br_startblock,
+ left.br_blockcount +
+ new->br_blockcount +
+ right.br_blockcount,
+ left.br_state)))
+ goto done;
+ }
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a real allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
+ left.br_blockcount + new->br_blockcount);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ if (cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur,
+ left.br_startoff,
+ left.br_startblock,
+ left.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, left.br_startoff,
+ left.br_startblock,
+ left.br_blockcount +
+ new->br_blockcount,
+ left.br_state)))
+ goto done;
+ }
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a real allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff, new->br_startblock,
+ new->br_blockcount + right.br_blockcount,
+ right.br_state);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ if (cur == NULL) {
+ rval = xfs_ilog_fext(whichfork);
+ } else {
+ rval = 0;
+ if ((error = xfs_bmbt_lookup_eq(cur,
+ right.br_startoff,
+ right.br_startblock,
+ right.br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ if ((error = xfs_bmbt_update(cur, new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount +
+ right.br_blockcount,
+ right.br_state)))
+ goto done;
+ }
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * real allocation.
+ * Insert a new entry.
+ */
+ xfs_iext_insert(ip, *idx, 1, new, state);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ if (cur == NULL) {
+ rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ } else {
+ rval = XFS_ILOG_CORE;
+ if ((error = xfs_bmbt_lookup_eq(cur,
+ new->br_startoff,
+ new->br_startblock,
+ new->br_blockcount, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ cur->bc_rec.b.br_state = new->br_state;
+ if ((error = xfs_btree_insert(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ break;
+ }
+done:
+ *logflagsp = rval;
+ return error;
+}
+
+/*
+ * Adjust the size of the new extent based on di_extsize and rt extsize.
+ */
+STATIC int
+xfs_bmap_extsize_align(
+ xfs_mount_t *mp,
+ xfs_bmbt_irec_t *gotp, /* next extent pointer */
+ xfs_bmbt_irec_t *prevp, /* previous extent pointer */
+ xfs_extlen_t extsz, /* align to this extent size */
+ int rt, /* is this a realtime inode? */
+ int eof, /* is extent at end-of-file? */
+ int delay, /* creating delalloc extent? */
+ int convert, /* overwriting unwritten extent? */
+ xfs_fileoff_t *offp, /* in/out: aligned offset */
+ xfs_extlen_t *lenp) /* in/out: aligned length */
+{
+ xfs_fileoff_t orig_off; /* original offset */
+ xfs_extlen_t orig_alen; /* original length */
+ xfs_fileoff_t orig_end; /* original off+len */
+ xfs_fileoff_t nexto; /* next file offset */
+ xfs_fileoff_t prevo; /* previous file offset */
+ xfs_fileoff_t align_off; /* temp for offset */
+ xfs_extlen_t align_alen; /* temp for length */
+ xfs_extlen_t temp; /* temp for calculations */
+
+ if (convert)
+ return 0;
+
+ orig_off = align_off = *offp;
+ orig_alen = align_alen = *lenp;
+ orig_end = orig_off + orig_alen;
+
+ /*
+ * If this request overlaps an existing extent, then don't
+ * attempt to perform any additional alignment.
+ */
+ if (!delay && !eof &&
+ (orig_off >= gotp->br_startoff) &&
+ (orig_end <= gotp->br_startoff + gotp->br_blockcount)) {
+ return 0;
+ }
+
+ /*
+ * If the file offset is unaligned vs. the extent size
+ * we need to align it. This will be possible unless
+ * the file was previously written with a kernel that didn't
+ * perform this alignment, or if a truncate shot us in the
+ * foot.
+ */
+ temp = do_mod(orig_off, extsz);
+ if (temp) {
+ align_alen += temp;
+ align_off -= temp;
+ }
+ /*
+ * Same adjustment for the end of the requested area.
+ */
+ if ((temp = (align_alen % extsz))) {
+ align_alen += extsz - temp;
+ }
+ /*
+ * If the previous block overlaps with this proposed allocation
+ * then move the start forward without adjusting the length.
+ */
+ if (prevp->br_startoff != NULLFILEOFF) {
+ if (prevp->br_startblock == HOLESTARTBLOCK)
+ prevo = prevp->br_startoff;
+ else
+ prevo = prevp->br_startoff + prevp->br_blockcount;
+ } else
+ prevo = 0;
+ if (align_off != orig_off && align_off < prevo)
+ align_off = prevo;
+ /*
+ * If the next block overlaps with this proposed allocation
+ * then move the start back without adjusting the length,
+ * but not before offset 0.
+ * This may of course make the start overlap previous block,
+ * and if we hit the offset 0 limit then the next block
+ * can still overlap too.
+ */
+ if (!eof && gotp->br_startoff != NULLFILEOFF) {
+ if ((delay && gotp->br_startblock == HOLESTARTBLOCK) ||
+ (!delay && gotp->br_startblock == DELAYSTARTBLOCK))
+ nexto = gotp->br_startoff + gotp->br_blockcount;
+ else
+ nexto = gotp->br_startoff;
+ } else
+ nexto = NULLFILEOFF;
+ if (!eof &&
+ align_off + align_alen != orig_end &&
+ align_off + align_alen > nexto)
+ align_off = nexto > align_alen ? nexto - align_alen : 0;
+ /*
+ * If we're now overlapping the next or previous extent that
+ * means we can't fit an extsz piece in this hole. Just move
+ * the start forward to the first valid spot and set
+ * the length so we hit the end.
+ */
+ if (align_off != orig_off && align_off < prevo)
+ align_off = prevo;
+ if (align_off + align_alen != orig_end &&
+ align_off + align_alen > nexto &&
+ nexto != NULLFILEOFF) {
+ ASSERT(nexto > prevo);
+ align_alen = nexto - align_off;
+ }
+
+ /*
+ * If realtime, and the result isn't a multiple of the realtime
+ * extent size we need to remove blocks until it is.
+ */
+ if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) {
+ /*
+ * We're not covering the original request, or
+ * we won't be able to once we fix the length.
+ */
+ if (orig_off < align_off ||
+ orig_end > align_off + align_alen ||
+ align_alen - temp < orig_alen)
+ return XFS_ERROR(EINVAL);
+ /*
+ * Try to fix it by moving the start up.
+ */
+ if (align_off + temp <= orig_off) {
+ align_alen -= temp;
+ align_off += temp;
+ }
+ /*
+ * Try to fix it by moving the end in.
+ */
+ else if (align_off + align_alen - temp >= orig_end)
+ align_alen -= temp;
+ /*
+ * Set the start to the minimum then trim the length.
+ */
+ else {
+ align_alen -= orig_off - align_off;
+ align_off = orig_off;
+ align_alen -= align_alen % mp->m_sb.sb_rextsize;
+ }
+ /*
+ * Result doesn't cover the request, fail it.
+ */
+ if (orig_off < align_off || orig_end > align_off + align_alen)
+ return XFS_ERROR(EINVAL);
+ } else {
+ ASSERT(orig_off >= align_off);
+ ASSERT(orig_end <= align_off + align_alen);
+ }
+
+#ifdef DEBUG
+ if (!eof && gotp->br_startoff != NULLFILEOFF)
+ ASSERT(align_off + align_alen <= gotp->br_startoff);
+ if (prevp->br_startoff != NULLFILEOFF)
+ ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount);
+#endif
+
+ *lenp = align_alen;
+ *offp = align_off;
+ return 0;
+}
+
+#define XFS_ALLOC_GAP_UNITS 4
+
+STATIC void
+xfs_bmap_adjacent(
+ xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+{
+ xfs_fsblock_t adjust; /* adjustment to block numbers */
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_mount_t *mp; /* mount point structure */
+ int nullfb; /* true if ap->firstblock isn't set */
+ int rt; /* true if inode is realtime */
+
+#define ISVALID(x,y) \
+ (rt ? \
+ (x) < mp->m_sb.sb_rblocks : \
+ XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+ XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+ XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+
+ mp = ap->ip->i_mount;
+ nullfb = ap->firstblock == NULLFSBLOCK;
+ rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+ /*
+ * If allocating at eof, and there's a previous real block,
+ * try to use its last block as our starting point.
+ */
+ if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prevp->br_startblock) &&
+ ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
+ ap->prevp->br_startblock)) {
+ ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
+ /*
+ * Adjust for the gap between prevp and us.
+ */
+ adjust = ap->off -
+ (ap->prevp->br_startoff + ap->prevp->br_blockcount);
+ if (adjust &&
+ ISVALID(ap->rval + adjust, ap->prevp->br_startblock))
+ ap->rval += adjust;
+ }
+ /*
+ * If not at eof, then compare the two neighbor blocks.
+ * Figure out whether either one gives us a good starting point,
+ * and pick the better one.
+ */
+ else if (!ap->eof) {
+ xfs_fsblock_t gotbno; /* right side block number */
+ xfs_fsblock_t gotdiff=0; /* right side difference */
+ xfs_fsblock_t prevbno; /* left side block number */
+ xfs_fsblock_t prevdiff=0; /* left side difference */
+
+ /*
+ * If there's a previous (left) block, select a requested
+ * start block based on it.
+ */
+ if (ap->prevp->br_startoff != NULLFILEOFF &&
+ !isnullstartblock(ap->prevp->br_startblock) &&
+ (prevbno = ap->prevp->br_startblock +
+ ap->prevp->br_blockcount) &&
+ ISVALID(prevbno, ap->prevp->br_startblock)) {
+ /*
+ * Calculate gap to end of previous block.
+ */
+ adjust = prevdiff = ap->off -
+ (ap->prevp->br_startoff +
+ ap->prevp->br_blockcount);
+ /*
+ * Figure the startblock based on the previous block's
+ * end and the gap size.
+ * Heuristic!
+ * If the gap is large relative to the piece we're
+ * allocating, or using it gives us an invalid block
+ * number, then just use the end of the previous block.
+ */
+ if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+ ISVALID(prevbno + prevdiff,
+ ap->prevp->br_startblock))
+ prevbno += adjust;
+ else
+ prevdiff += adjust;
+ /*
+ * If the firstblock forbids it, can't use it,
+ * must use default.
+ */
+ if (!rt && !nullfb &&
+ XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+ prevbno = NULLFSBLOCK;
+ }
+ /*
+ * No previous block or can't follow it, just default.
+ */
+ else
+ prevbno = NULLFSBLOCK;
+ /*
+ * If there's a following (right) block, select a requested
+ * start block based on it.
+ */
+ if (!isnullstartblock(ap->gotp->br_startblock)) {
+ /*
+ * Calculate gap to start of next block.
+ */
+ adjust = gotdiff = ap->gotp->br_startoff - ap->off;
+ /*
+ * Figure the startblock based on the next block's
+ * start and the gap size.
+ */
+ gotbno = ap->gotp->br_startblock;
+ /*
+ * Heuristic!
+ * If the gap is large relative to the piece we're
+ * allocating, or using it gives us an invalid block
+ * number, then just use the start of the next block
+ * offset by our length.
+ */
+ if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+ ISVALID(gotbno - gotdiff, gotbno))
+ gotbno -= adjust;
+ else if (ISVALID(gotbno - ap->alen, gotbno)) {
+ gotbno -= ap->alen;
+ gotdiff += adjust - ap->alen;
+ } else
+ gotdiff += adjust;
+ /*
+ * If the firstblock forbids it, can't use it,
+ * must use default.
+ */
+ if (!rt && !nullfb &&
+ XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+ gotbno = NULLFSBLOCK;
+ }
+ /*
+ * No next block, just default.
+ */
+ else
+ gotbno = NULLFSBLOCK;
+ /*
+ * If both valid, pick the better one, else the only good
+ * one, else ap->rval is already set (to 0 or the inode block).
+ */
+ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+ ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
+ else if (prevbno != NULLFSBLOCK)
+ ap->rval = prevbno;
+ else if (gotbno != NULLFSBLOCK)
+ ap->rval = gotbno;
+ }
+#undef ISVALID
+}
+
+STATIC int
+xfs_bmap_rtalloc(
+ xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+{
+ xfs_alloctype_t atype = 0; /* type for allocation routines */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* mount point structure */
+ xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_rtblock_t rtb;
+
+ mp = ap->ip->i_mount;
+ align = xfs_get_extsz_hint(ap->ip);
+ prod = align / mp->m_sb.sb_rextsize;
+ error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
+ align, 1, ap->eof, 0,
+ ap->conv, &ap->off, &ap->alen);
+ if (error)
+ return error;
+ ASSERT(ap->alen);
+ ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
+
+ /*
+ * If the offset & length are not perfectly aligned
+ * then kill prod, it will just get us in trouble.
+ */
+ if (do_mod(ap->off, align) || ap->alen % align)
+ prod = 1;
+ /*
+ * Set ralen to be the actual requested length in rtextents.
+ */
+ ralen = ap->alen / mp->m_sb.sb_rextsize;
+ /*
+ * If the old value was close enough to MAXEXTLEN that
+ * we rounded up to it, cut it back so it's valid again.
+ * Note that if it's a really large request (bigger than
+ * MAXEXTLEN), we don't hear about that number, and can't
+ * adjust the starting point to match it.
+ */
+ if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+ ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+ /*
+ * Lock out other modifications to the RT bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+ /*
+ * If it's an allocation to an empty file at offset 0,
+ * pick an extent that will space things out in the rt area.
+ */
+ if (ap->eof && ap->off == 0) {
+ xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+
+ error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+ if (error)
+ return error;
+ ap->rval = rtx * mp->m_sb.sb_rextsize;
+ } else {
+ ap->rval = 0;
+ }
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * Realtime allocation, done through xfs_rtallocate_extent.
+ */
+ atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+ do_div(ap->rval, mp->m_sb.sb_rextsize);
+ rtb = ap->rval;
+ ap->alen = ralen;
+ if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
+ &ralen, atype, ap->wasdel, prod, &rtb)))
+ return error;
+ if (rtb == NULLFSBLOCK && prod > 1 &&
+ (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
+ ap->alen, &ralen, atype,
+ ap->wasdel, 1, &rtb)))
+ return error;
+ ap->rval = rtb;
+ if (ap->rval != NULLFSBLOCK) {
+ ap->rval *= mp->m_sb.sb_rextsize;
+ ralen *= mp->m_sb.sb_rextsize;
+ ap->alen = ralen;
+ ap->ip->i_d.di_nblocks += ralen;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel)
+ ap->ip->i_delayed_blks -= ralen;
+ /*
+ * Adjust the disk quota also. This was reserved
+ * earlier.
+ */
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+ XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+ } else {
+ ap->alen = 0;
+ }
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_nullfb(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_perag *pag;
+ xfs_agnumber_t ag, startag;
+ int notinit = 0;
+ int error;
+
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ else
+ args->type = XFS_ALLOCTYPE_START_BNO;
+ args->total = ap->total;
+
+ /*
+ * Search for an allocation group with a single extent large enough
+ * for the request. If one isn't found, then adjust the minimum
+ * allocation size to the largest space found.
+ */
+ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (startag == NULLAGNUMBER)
+ startag = ag = 0;
+
+ pag = xfs_perag_get(mp, ag);
+ while (*blen < args->maxlen) {
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, args->tp, ag,
+ XFS_ALLOC_FLAG_TRYLOCK);
+ if (error) {
+ xfs_perag_put(pag);
+ return error;
+ }
+ }
+
+ /*
+ * See xfs_alloc_fix_freelist...
+ */
+ if (pag->pagf_init) {
+ xfs_extlen_t longest;
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (*blen < longest)
+ *blen = longest;
+ } else
+ notinit = 1;
+
+ if (xfs_inode_is_filestream(ap->ip)) {
+ if (*blen >= args->maxlen)
+ break;
+
+ if (ap->userdata) {
+ /*
+ * If startag is an invalid AG, we've
+ * come here once before and
+ * xfs_filestream_new_ag picked the
+ * best currently available.
+ *
+ * Don't continue looping, since we
+ * could loop forever.
+ */
+ if (startag == NULLAGNUMBER)
+ break;
+
+ error = xfs_filestream_new_ag(ap, &ag);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ /* loop again to set 'blen'*/
+ startag = NULLAGNUMBER;
+ pag = xfs_perag_get(mp, ag);
+ continue;
+ }
+ }
+ if (++ag == mp->m_sb.sb_agcount)
+ ag = 0;
+ if (ag == startag)
+ break;
+ xfs_perag_put(pag);
+ pag = xfs_perag_get(mp, ag);
+ }
+ xfs_perag_put(pag);
+
+ /*
+ * Since the above loop did a BUF_TRYLOCK, it is
+ * possible that there is space for this request.
+ */
+ if (notinit || *blen < ap->minlen)
+ args->minlen = ap->minlen;
+ /*
+ * If the best seen length is less than the request
+ * length, use the best as the minimum.
+ */
+ else if (*blen < args->maxlen)
+ args->minlen = *blen;
+ /*
+ * Otherwise we've seen an extent as big as maxlen,
+ * use that as the minimum.
+ */
+ else
+ args->minlen = args->maxlen;
+
+ /*
+ * set the failure fallback case to look in the selected
+ * AG as the stream may have moved.
+ */
+ if (xfs_inode_is_filestream(ap->ip))
+ ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
+
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc(
+ xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+{
+ xfs_mount_t *mp; /* mount point structure */
+ xfs_alloctype_t atype = 0; /* type for allocation routines */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_agnumber_t ag;
+ xfs_alloc_arg_t args;
+ xfs_extlen_t blen;
+ xfs_extlen_t nextminlen = 0;
+ int nullfb; /* true if ap->firstblock isn't set */
+ int isaligned;
+ int tryagain;
+ int error;
+
+ mp = ap->ip->i_mount;
+ align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
+ if (unlikely(align)) {
+ error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp,
+ align, 0, ap->eof, 0, ap->conv,
+ &ap->off, &ap->alen);
+ ASSERT(!error);
+ ASSERT(ap->alen);
+ }
+ nullfb = ap->firstblock == NULLFSBLOCK;
+ fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+ if (nullfb) {
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
+ ag = xfs_filestream_lookup_ag(ap->ip);
+ ag = (ag != NULLAGNUMBER) ? ag : 0;
+ ap->rval = XFS_AGB_TO_FSB(mp, ag, 0);
+ } else {
+ ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+ }
+ } else
+ ap->rval = ap->firstblock;
+
+ xfs_bmap_adjacent(ap);
+
+ /*
+ * If allowed, use ap->rval; otherwise must use firstblock since
+ * it's in the right allocation group.
+ */
+ if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno)
+ ;
+ else
+ ap->rval = ap->firstblock;
+ /*
+ * Normal allocation, done through xfs_alloc_vextent.
+ */
+ tryagain = isaligned = 0;
+ args.tp = ap->tp;
+ args.mp = mp;
+ args.fsbno = ap->rval;
+
+ /* Trim the allocation back to the maximum an AG can fit. */
+ args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
+ args.firstblock = ap->firstblock;
+ blen = 0;
+ if (nullfb) {
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ if (error)
+ return error;
+ } else if (ap->low) {
+ if (xfs_inode_is_filestream(ap->ip))
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ else
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.total = args.minlen = ap->minlen;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.total = ap->total;
+ args.minlen = ap->minlen;
+ }
+ /* apply extent size hints if obtained earlier */
+ if (unlikely(align)) {
+ args.prod = align;
+ if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
+ args.mod = (xfs_extlen_t)(args.prod - args.mod);
+ } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+ args.prod = 1;
+ args.mod = 0;
+ } else {
+ args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+ if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
+ args.mod = (xfs_extlen_t)(args.prod - args.mod);
+ }
+ /*
+ * If we are not low on available data blocks, and the
+ * underlying logical volume manager is a stripe, and
+ * the file offset is zero then try to allocate data
+ * blocks on stripe unit boundary.
+ * NOTE: ap->aeof is only set if the allocation length
+ * is >= the stripe unit and the allocation offset is
+ * at the end of file.
+ */
+ if (!ap->low && ap->aeof) {
+ if (!ap->off) {
+ args.alignment = mp->m_dalign;
+ atype = args.type;
+ isaligned = 1;
+ /*
+ * Adjust for alignment
+ */
+ if (blen > args.alignment && blen <= args.maxlen)
+ args.minlen = blen - args.alignment;
+ args.minalignslop = 0;
+ } else {
+ /*
+ * First try an exact bno allocation.
+ * If it fails then do a near or start bno
+ * allocation with alignment turned on.
+ */
+ atype = args.type;
+ tryagain = 1;
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.alignment = 1;
+ /*
+ * Compute the minlen+alignment for the
+ * next case. Set slop so that the value
+ * of minlen+alignment+slop doesn't go up
+ * between the calls.
+ */
+ if (blen > mp->m_dalign && blen <= args.maxlen)
+ nextminlen = blen - mp->m_dalign;
+ else
+ nextminlen = args.minlen;
+ if (nextminlen + mp->m_dalign > args.minlen + 1)
+ args.minalignslop =
+ nextminlen + mp->m_dalign -
+ args.minlen - 1;
+ else
+ args.minalignslop = 0;
+ }
+ } else {
+ args.alignment = 1;
+ args.minalignslop = 0;
+ }
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.isfl = 0;
+ args.userdata = ap->userdata;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ if (tryagain && args.fsbno == NULLFSBLOCK) {
+ /*
+ * Exact allocation failed. Now try with alignment
+ * turned on.
+ */
+ args.type = atype;
+ args.fsbno = ap->rval;
+ args.alignment = mp->m_dalign;
+ args.minlen = nextminlen;
+ args.minalignslop = 0;
+ isaligned = 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (isaligned && args.fsbno == NULLFSBLOCK) {
+ /*
+ * allocation failed, so turn off alignment and
+ * try again.
+ */
+ args.type = atype;
+ args.fsbno = ap->rval;
+ args.alignment = 0;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb &&
+ args.minlen > ap->minlen) {
+ args.minlen = ap->minlen;
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = ap->rval;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK && nullfb) {
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.total = ap->minlen;
+ args.minleft = 0;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ ap->low = 1;
+ }
+ if (args.fsbno != NULLFSBLOCK) {
+ ap->firstblock = ap->rval = args.fsbno;
+ ASSERT(nullfb || fb_agno == args.agno ||
+ (ap->low && fb_agno < args.agno));
+ ap->alen = args.len;
+ ap->ip->i_d.di_nblocks += args.len;
+ xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+ if (ap->wasdel)
+ ap->ip->i_delayed_blks -= args.len;
+ /*
+ * Adjust the disk quota also. This was reserved
+ * earlier.
+ */
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+ ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
+ XFS_TRANS_DQ_BCOUNT,
+ (long) args.len);
+ } else {
+ ap->rval = NULLFSBLOCK;
+ ap->alen = 0;
+ }
+ return 0;
+}
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int
+xfs_bmap_alloc(
+ xfs_bmalloca_t *ap) /* bmap alloc argument struct */
+{
+ if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
+ return xfs_bmap_rtalloc(ap);
+ return xfs_bmap_btalloc(ap);
+}
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int /* error */
+xfs_bmap_btree_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ /* REFERENCED */
+ struct xfs_btree_block *cblock;/* child btree block */
+ xfs_fsblock_t cbno; /* child block number */
+ xfs_buf_t *cbp; /* child block's buffer */
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork data */
+ xfs_mount_t *mp; /* mount point structure */
+ __be64 *pp; /* ptr to block address */
+ struct xfs_btree_block *rblock;/* root btree block */
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ rblock = ifp->if_broot;
+ ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+ ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+ cbno = be64_to_cpu(*pp);
+ *logflagsp = 0;
+#ifdef DEBUG
+ if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+ return error;
+#endif
+ if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
+ XFS_BMAP_BTREE_REF)))
+ return error;
+ cblock = XFS_BUF_TO_BLOCK(cbp);
+ if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+ return error;
+ xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+ ip->i_d.di_nblocks--;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, cbp);
+ if (cur->bc_bufs[0] == cbp)
+ cur->bc_bufs[0] = NULL;
+ xfs_iroot_realloc(ip, -1, whichfork);
+ ASSERT(ifp->if_broot == NULL);
+ ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ return 0;
+}
+
+/*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int /* error */
+xfs_bmap_del_extent(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_trans_t *tp, /* current transaction pointer */
+ xfs_extnum_t *idx, /* extent number to update/delete */
+ xfs_bmap_free_t *flist, /* list of extents to be freed */
+ xfs_btree_cur_t *cur, /* if null, not a btree */
+ xfs_bmbt_irec_t *del, /* data to remove from extents */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
+ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
+ xfs_fsblock_t del_endblock=0; /* first block past del */
+ xfs_fileoff_t del_endoff; /* first offset past del */
+ int delay; /* current block is delayed allocated */
+ int do_fx; /* free extent at end of routine */
+ xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */
+ int error; /* error return value */
+ int flags; /* inode logging flags */
+ xfs_bmbt_irec_t got; /* current extent entry */
+ xfs_fileoff_t got_endoff; /* first offset past got */
+ int i; /* temp state */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t nblks; /* quota/sb block count */
+ xfs_bmbt_irec_t new; /* new record to be inserted */
+ /* REFERENCED */
+ uint qfield; /* quota field to update */
+ xfs_filblks_t temp; /* for indirect length calculations */
+ xfs_filblks_t temp2; /* for indirect length calculations */
+ int state = 0;
+
+ XFS_STATS_INC(xs_del_exlist);
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(del->br_blockcount > 0);
+ ep = xfs_iext_get_ext(ifp, *idx);
+ xfs_bmbt_get_all(ep, &got);
+ ASSERT(got.br_startoff <= del->br_startoff);
+ del_endoff = del->br_startoff + del->br_blockcount;
+ got_endoff = got.br_startoff + got.br_blockcount;
+ ASSERT(got_endoff >= del_endoff);
+ delay = isnullstartblock(got.br_startblock);
+ ASSERT(isnullstartblock(del->br_startblock) == delay);
+ flags = 0;
+ qfield = 0;
+ error = 0;
+ /*
+ * If deleting a real allocation, must free up the disk space.
+ */
+ if (!delay) {
+ flags = XFS_ILOG_CORE;
+ /*
+ * Realtime allocation. Free it and record di_nblocks update.
+ */
+ if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+ xfs_fsblock_t bno;
+ xfs_filblks_t len;
+
+ ASSERT(do_mod(del->br_blockcount,
+ mp->m_sb.sb_rextsize) == 0);
+ ASSERT(do_mod(del->br_startblock,
+ mp->m_sb.sb_rextsize) == 0);
+ bno = del->br_startblock;
+ len = del->br_blockcount;
+ do_div(bno, mp->m_sb.sb_rextsize);
+ do_div(len, mp->m_sb.sb_rextsize);
+ if ((error = xfs_rtfree_extent(ip->i_transp, bno,
+ (xfs_extlen_t)len)))
+ goto done;
+ do_fx = 0;
+ nblks = len * mp->m_sb.sb_rextsize;
+ qfield = XFS_TRANS_DQ_RTBCOUNT;
+ }
+ /*
+ * Ordinary allocation.
+ */
+ else {
+ do_fx = 1;
+ nblks = del->br_blockcount;
+ qfield = XFS_TRANS_DQ_BCOUNT;
+ }
+ /*
+ * Set up del_endblock and cur for later.
+ */
+ del_endblock = del->br_startblock + del->br_blockcount;
+ if (cur) {
+ if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock, got.br_blockcount,
+ &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ }
+ da_old = da_new = 0;
+ } else {
+ da_old = startblockval(got.br_startblock);
+ da_new = 0;
+ nblks = 0;
+ do_fx = 0;
+ }
+ /*
+ * Set flag value to use in switch statement.
+ * Left-contig is 2, right-contig is 1.
+ */
+ switch (((got.br_startoff == del->br_startoff) << 1) |
+ (got_endoff == del_endoff)) {
+ case 3:
+ /*
+ * Matches the whole extent. Delete the entry.
+ */
+ xfs_iext_remove(ip, *idx, 1,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+ --*idx;
+ if (delay)
+ break;
+
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ flags |= XFS_ILOG_CORE;
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_btree_delete(cur, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ break;
+
+ case 2:
+ /*
+ * Deleting the first part of the extent.
+ */
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_startoff(ep, del_endoff);
+ temp = got.br_blockcount - del->br_blockcount;
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ xfs_bmbt_set_startblock(ep, del_endblock);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+ got.br_blockcount - del->br_blockcount,
+ got.br_state)))
+ goto done;
+ break;
+
+ case 1:
+ /*
+ * Deleting the last part of the extent.
+ */
+ temp = got.br_blockcount - del->br_blockcount;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ if (delay) {
+ temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ da_old);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ da_new = temp;
+ break;
+ }
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ if (!cur) {
+ flags |= xfs_ilog_fext(whichfork);
+ break;
+ }
+ if ((error = xfs_bmbt_update(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount - del->br_blockcount,
+ got.br_state)))
+ goto done;
+ break;
+
+ case 0:
+ /*
+ * Deleting the middle of the extent.
+ */
+ temp = del->br_startoff - got.br_startoff;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(ep, temp);
+ new.br_startoff = del_endoff;
+ temp2 = got_endoff - del_endoff;
+ new.br_blockcount = temp2;
+ new.br_state = got.br_state;
+ if (!delay) {
+ new.br_startblock = del_endblock;
+ flags |= XFS_ILOG_CORE;
+ if (cur) {
+ if ((error = xfs_bmbt_update(cur,
+ got.br_startoff,
+ got.br_startblock, temp,
+ got.br_state)))
+ goto done;
+ if ((error = xfs_btree_increment(cur, 0, &i)))
+ goto done;
+ cur->bc_rec.b = new;
+ error = xfs_btree_insert(cur, &i);
+ if (error && error != ENOSPC)
+ goto done;
+ /*
+ * If get no-space back from btree insert,
+ * it tried a split, and we have a zero
+ * block reservation.
+ * Fix up our state and return the error.
+ */
+ if (error == ENOSPC) {
+ /*
+ * Reset the cursor, don't trust
+ * it after any insert operation.
+ */
+ if ((error = xfs_bmbt_lookup_eq(cur,
+ got.br_startoff,
+ got.br_startblock,
+ temp, &i)))
+ goto done;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ /*
+ * Update the btree record back
+ * to the original value.
+ */
+ if ((error = xfs_bmbt_update(cur,
+ got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ got.br_state)))
+ goto done;
+ /*
+ * Reset the extent record back
+ * to the original value.
+ */
+ xfs_bmbt_set_blockcount(ep,
+ got.br_blockcount);
+ flags = 0;
+ error = XFS_ERROR(ENOSPC);
+ goto done;
+ }
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ } else
+ flags |= xfs_ilog_fext(whichfork);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ } else {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ temp = xfs_bmap_worst_indlen(ip, temp);
+ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+ temp2 = xfs_bmap_worst_indlen(ip, temp2);
+ new.br_startblock = nullstartblock((int)temp2);
+ da_new = temp + temp2;
+ while (da_new > da_old) {
+ if (temp) {
+ temp--;
+ da_new--;
+ xfs_bmbt_set_startblock(ep,
+ nullstartblock((int)temp));
+ }
+ if (da_new == da_old)
+ break;
+ if (temp2) {
+ temp2--;
+ da_new--;
+ new.br_startblock =
+ nullstartblock((int)temp2);
+ }
+ }
+ }
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+ ++*idx;
+ break;
+ }
+ /*
+ * If we need to, add to list of extents to delete.
+ */
+ if (do_fx)
+ xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+ mp);
+ /*
+ * Adjust inode # blocks in the file.
+ */
+ if (nblks)
+ ip->i_d.di_nblocks -= nblks;
+ /*
+ * Adjust quota data.
+ */
+ if (qfield)
+ xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+ /*
+ * Account for change in delayed indirect blocks.
+ * Nothing to do for disk quota accounting here.
+ */
+ ASSERT(da_old >= da_new);
+ if (da_old > da_new) {
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ (int64_t)(da_old - da_new), 0);
+ }
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
+ * Remove the entry "free" from the free item list. Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+STATIC void
+xfs_bmap_del_free(
+ xfs_bmap_free_t *flist, /* free item list header */
+ xfs_bmap_free_item_t *prev, /* previous item on list, if any */
+ xfs_bmap_free_item_t *free) /* list item to be freed */
+{
+ if (prev)
+ prev->xbfi_next = free->xbfi_next;
+ else
+ flist->xbf_first = free->xbfi_next;
+ flist->xbf_count--;
+ kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int /* error */
+xfs_bmap_extents_to_btree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first-block-allocated */
+ xfs_bmap_free_t *flist, /* blocks freed in xaction */
+ xfs_btree_cur_t **curp, /* cursor returned to caller */
+ int wasdel, /* converting a delayed alloc */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *ablock; /* allocated (child) bt block */
+ xfs_buf_t *abp; /* buffer for ablock */
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_bmbt_rec_t *arp; /* child record pointer */
+ struct xfs_btree_block *block; /* btree root block */
+ xfs_btree_cur_t *cur; /* bmap btree cursor */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ int error; /* error return value */
+ xfs_extnum_t i, cnt; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_key_t *kp; /* root block key pointer */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_bmbt_ptr_t *pp; /* root block address pointer */
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(ifp->if_ext_max ==
+ XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+ /*
+ * Make space in the inode incore.
+ */
+ xfs_iroot_realloc(ip, 1, whichfork);
+ ifp->if_flags |= XFS_IFBROOT;
+
+ /*
+ * Fill in the root.
+ */
+ block = ifp->if_broot;
+ block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
+ block->bb_level = cpu_to_be16(1);
+ block->bb_numrecs = cpu_to_be16(1);
+ block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+
+ /*
+ * Need a cursor. Can't allocate until bb_level is filled in.
+ */
+ mp = ip->i_mount;
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ /*
+ * Convert to a btree with two levels, one record in root.
+ */
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ args.tp = tp;
+ args.mp = mp;
+ args.firstblock = *firstblock;
+ if (*firstblock == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+ } else if (flist->xbf_low) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = *firstblock;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.fsbno = *firstblock;
+ }
+ args.minlen = args.maxlen = args.prod = 1;
+ args.total = args.minleft = args.alignment = args.mod = args.isfl =
+ args.minalignslop = 0;
+ args.wasdel = wasdel;
+ *logflagsp = 0;
+ if ((error = xfs_alloc_vextent(&args))) {
+ xfs_iroot_realloc(ip, -1, whichfork);
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ /*
+ * Allocation can't fail, the space was reserved.
+ */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+ (flist->xbf_low &&
+ args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+ *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ ip->i_d.di_nblocks++;
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+ abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+ /*
+ * Fill in the child block.
+ */
+ ablock = XFS_BUF_TO_BLOCK(abp);
+ ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
+ ablock->bb_level = 0;
+ ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (cnt = i = 0; i < nextents; i++) {
+ ep = xfs_iext_get_ext(ifp, i);
+ if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+ arp->l0 = cpu_to_be64(ep->l0);
+ arp->l1 = cpu_to_be64(ep->l1);
+ arp++; cnt++;
+ }
+ }
+ ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+ xfs_btree_set_numrecs(ablock, cnt);
+
+ /*
+ * Fill in the root key and pointer.
+ */
+ kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ be16_to_cpu(block->bb_level)));
+ *pp = cpu_to_be64(args.fsbno);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+ ASSERT(*curp == NULL);
+ *curp = cur;
+ *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+ return 0;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint offset;
+
+ if (mp->m_sb.sb_inodesize == 256) {
+ offset = XFS_LITINO(mp) -
+ XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ } else {
+ offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+ }
+
+ ASSERT(offset < XFS_LITINO(mp));
+ return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ if (whichfork == XFS_ATTR_FORK &&
+ ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+ ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+ uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+ if (dfl_forkoff > ip->i_d.di_forkoff) {
+ ip->i_d.di_forkoff = dfl_forkoff;
+ ip->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+ ip->i_afp->if_ext_max =
+ XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
+ }
+ }
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+STATIC int /* error */
+xfs_bmap_local_to_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fsblock_t *firstblock, /* first block allocated in xaction */
+ xfs_extlen_t total, /* total blocks needed by transaction */
+ int *logflagsp, /* inode logging flags */
+ int whichfork) /* data or attr fork */
+{
+ int error; /* error return value */
+ int flags; /* logging flags returned */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+
+ /*
+ * We don't want to deal with the case of keeping inode data inline yet.
+ * So sending the data fork of a regular inode is invalid.
+ */
+ ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG &&
+ whichfork == XFS_DATA_FORK));
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ flags = 0;
+ error = 0;
+ if (ifp->if_bytes) {
+ xfs_alloc_arg_t args; /* allocation arguments */
+ xfs_buf_t *bp; /* buffer for extent block */
+ xfs_bmbt_rec_host_t *ep;/* extent record pointer */
+
+ args.tp = tp;
+ args.mp = ip->i_mount;
+ args.firstblock = *firstblock;
+ ASSERT((ifp->if_flags &
+ (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
+ /*
+ * Allocate a block. We know we need only one, since the
+ * file currently fits in an inode.
+ */
+ if (*firstblock == NULLFSBLOCK) {
+ args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.fsbno = *firstblock;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+ args.total = total;
+ args.mod = args.minleft = args.alignment = args.wasdel =
+ args.isfl = args.minalignslop = 0;
+ args.minlen = args.maxlen = args.prod = 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ goto done;
+ /*
+ * Can't fail, the space was reserved.
+ */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(args.len == 1);
+ *firstblock = args.fsbno;
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+ memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data,
+ ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
+ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+ xfs_iext_add(ifp, 0, 1);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+ trace_xfs_bmap_post_update(ip, 0,
+ whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+ _THIS_IP_);
+ XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ ip->i_d.di_nblocks = 1;
+ xfs_trans_mod_dquot_byino(tp, ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+ flags |= xfs_ilog_fext(whichfork);
+ } else {
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+ xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ }
+ ifp->if_flags &= ~XFS_IFINLINE;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ flags |= XFS_ILOG_CORE;
+done:
+ *logflagsp = flags;
+ return error;
+}
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry. If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none). Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ xfs_extnum_t lastx; /* last extent index */
+
+ /*
+ * Initialize the extent entry structure to catch access to
+ * uninitialized br_startblock field.
+ */
+ gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+ gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+ gotp->br_state = XFS_EXT_INVALID;
+#if XFS_BIG_BLKNOS
+ gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+#else
+ gotp->br_startblock = 0xffffa5a5;
+#endif
+ prevp->br_startoff = NULLFILEOFF;
+
+ ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+ if (lastx > 0) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+ }
+ if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+ xfs_bmbt_get_all(ep, gotp);
+ *eofp = 0;
+ } else {
+ if (lastx > 0) {
+ *gotp = *prevp;
+ }
+ *eofp = 1;
+ ep = NULL;
+ }
+ *lastxp = lastx;
+ return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry. If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+xfs_bmap_search_extents(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t bno, /* block number searched for */
+ int fork, /* data or attr fork */
+ int *eofp, /* out: end of file found */
+ xfs_extnum_t *lastxp, /* out: last extent index */
+ xfs_bmbt_irec_t *gotp, /* out: extent entry found */
+ xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
+{
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+
+ XFS_STATS_INC(xs_look_exlist);
+ ifp = XFS_IFORK_PTR(ip, fork);
+
+ ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+ if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+ !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x lastx: %x\n",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)gotp->br_startblock,
+ (unsigned long long)gotp->br_startoff,
+ (unsigned long long)gotp->br_blockcount,
+ gotp->br_state, *lastxp);
+ *lastxp = NULLEXTNUM;
+ *eofp = 1;
+ return NULL;
+ }
+ return ep;
+}
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
+{
+ int level; /* btree level number */
+ int maxrecs; /* maximum record count at this level */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_filblks_t rval; /* return value */
+
+ mp = ip->i_mount;
+ maxrecs = mp->m_bmap_dmxr[0];
+ for (level = 0, rval = 0;
+ level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+ level++) {
+ len += maxrecs - 1;
+ do_div(len, maxrecs);
+ rval += len;
+ if (len == 1)
+ return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ level - 1;
+ if (level == 0)
+ maxrecs = mp->m_bmap_dmxr[1];
+ }
+ return rval;
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int /* error code */
+xfs_bmap_add_attrfork(
+ xfs_inode_t *ip, /* incore inode pointer */
+ int size, /* space new attribute needs */
+ int rsvd) /* xact may use reserved blks */
+{
+ xfs_fsblock_t firstblock; /* 1st block/ag allocated */
+ xfs_bmap_free_t flist; /* freed extent records */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_trans_t *tp; /* transaction pointer */
+ int blks; /* space reservation */
+ int version = 1; /* superblock attr version */
+ int committed; /* xaction was committed */
+ int logflags; /* logging flags */
+ int error; /* error return value */
+
+ ASSERT(XFS_IFORK_Q(ip) == 0);
+ ASSERT(ip->i_df.if_ext_max ==
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+
+ mp = ip->i_mount;
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+ blks = XFS_ADDAFORK_SPACE_RES(mp);
+ if (rsvd)
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+ goto error0;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+ XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+ XFS_QMOPT_RES_REGBLKS);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return error;
+ }
+ if (XFS_IFORK_Q(ip))
+ goto error1;
+ if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+ /*
+ * For inodes coming from pre-6.2 filesystems.
+ */
+ ASSERT(ip->i_d.di_aformat == 0);
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ }
+ ASSERT(ip->i_d.di_anextents == 0);
+
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_UUID:
+ ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+ else if (mp->m_flags & XFS_MOUNT_ATTR2)
+ version = 2;
+ break;
+ default:
+ ASSERT(0);
+ error = XFS_ERROR(EINVAL);
+ goto error1;
+ }
+ ip->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(ip->i_afp == NULL);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp->if_ext_max =
+ XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+ ip->i_afp->if_flags = XFS_IFEXTENTS;
+ logflags = 0;
+ xfs_bmap_init(&flist, &firstblock);
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+ &flist, &logflags);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+ &logflags);
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (error)
+ goto error2;
+ if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+ (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+ __int64_t sbfields = 0;
+
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+ xfs_sb_version_addattr(&mp->m_sb);
+ sbfields |= XFS_SB_VERSIONNUM;
+ }
+ if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+ xfs_sb_version_addattr2(&mp->m_sb);
+ sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+ }
+ if (sbfields) {
+ spin_unlock(&mp->m_sb_lock);
+ xfs_mod_sb(tp, sbfields);
+ } else
+ spin_unlock(&mp->m_sb_lock);
+ }
+ if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+ goto error2;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ ASSERT(ip->i_df.if_ext_max ==
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+ return error;
+error2:
+ xfs_bmap_cancel(&flist);
+error1:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ ASSERT(ip->i_df.if_ext_max ==
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+ return error;
+}
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+/* ARGSUSED */
+void
+xfs_bmap_add_free(
+ xfs_fsblock_t bno, /* fs block number of extent */
+ xfs_filblks_t len, /* length of extent */
+ xfs_bmap_free_t *flist, /* list of extents */
+ xfs_mount_t *mp) /* mount point structure */
+{
+ xfs_bmap_free_item_t *cur; /* current (next) element */
+ xfs_bmap_free_item_t *new; /* new element */
+ xfs_bmap_free_item_t *prev; /* previous element */
+#ifdef DEBUG
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+
+ ASSERT(bno != NULLFSBLOCK);
+ ASSERT(len > 0);
+ ASSERT(len <= MAXEXTLEN);
+ ASSERT(!isnullstartblock(bno));
+ agno = XFS_FSB_TO_AGNO(mp, bno);
+ agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(agbno < mp->m_sb.sb_agblocks);
+ ASSERT(len < mp->m_sb.sb_agblocks);
+ ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+ ASSERT(xfs_bmap_free_item_zone != NULL);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new->xbfi_startblock = bno;
+ new->xbfi_blockcount = (xfs_extlen_t)len;
+ for (prev = NULL, cur = flist->xbf_first;
+ cur != NULL;
+ prev = cur, cur = cur->xbfi_next) {
+ if (cur->xbfi_startblock >= bno)
+ break;
+ }
+ if (prev)
+ prev->xbfi_next = new;
+ else
+ flist->xbf_first = new;
+ new->xbfi_next = cur;
+ flist->xbf_count++;
+}
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+ xfs_mount_t *mp, /* file system mount structure */
+ int whichfork) /* data or attr fork */
+{
+ int level; /* btree level */
+ uint maxblocks; /* max blocks at this level */
+ uint maxleafents; /* max leaf entries possible */
+ int maxrootrecs; /* max records in root block */
+ int minleafrecs; /* min records in leaf block */
+ int minnoderecs; /* min records in node block */
+ int sz; /* root block size */
+
+ /*
+ * The maximum number of extents in a file, hence the maximum
+ * number of leaf entries, is controlled by the type of di_nextents
+ * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+ * (a signed 16-bit number, xfs_aextnum_t).
+ *
+ * Note that we can no longer assume that if we are in ATTR1 that
+ * the fork offset of all the inodes will be
+ * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+ * with ATTR2 and then mounted back with ATTR1, keeping the
+ * di_forkoff's fixed but probably at various positions. Therefore,
+ * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+ * of a minimum size available.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ maxleafents = MAXEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ } else {
+ maxleafents = MAXAEXTNUM;
+ sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ }
+ maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
+ minleafrecs = mp->m_bmap_dmnr[0];
+ minnoderecs = mp->m_bmap_dmnr[1];
+ maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ for (level = 1; maxblocks > 1; level++) {
+ if (maxblocks <= maxrootrecs)
+ maxblocks = 1;
+ else
+ maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ }
+ mp->m_bm_maxlevels[whichfork] = level;
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller. Frees all the extents that need freeing, which must be done
+ * last due to locking considerations. We never free any extents in
+ * the first transaction.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int /* error */
+xfs_bmap_finish(
+ xfs_trans_t **tp, /* transaction pointer addr */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *committed) /* xact committed or not */
+{
+ xfs_efd_log_item_t *efd; /* extent free data */
+ xfs_efi_log_item_t *efi; /* extent free intention */
+ int error; /* error return value */
+ xfs_bmap_free_item_t *free; /* free extent item */
+ unsigned int logres; /* new log reservation */
+ unsigned int logcount; /* new log count */
+ xfs_mount_t *mp; /* filesystem mount structure */
+ xfs_bmap_free_item_t *next; /* next item on free list */
+ xfs_trans_t *ntp; /* new transaction pointer */
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+ if (flist->xbf_count == 0) {
+ *committed = 0;
+ return 0;
+ }
+ ntp = *tp;
+ efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+ for (free = flist->xbf_first; free; free = free->xbfi_next)
+ xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ logres = ntp->t_log_res;
+ logcount = ntp->t_log_count;
+ ntp = xfs_trans_dup(*tp);
+ error = xfs_trans_commit(*tp, 0);
+ *tp = ntp;
+ *committed = 1;
+ /*
+ * We have a new transaction, so we should return committed=1,
+ * even though we're returning an error.
+ */
+ if (error)
+ return error;
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(ntp->t_ticket);
+
+ if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
+ logcount)))
+ return error;
+ efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+ for (free = flist->xbf_first; free != NULL; free = next) {
+ next = free->xbfi_next;
+ if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+ free->xbfi_blockcount))) {
+ /*
+ * The bmap free list will be cleaned up at a
+ * higher level. The EFI will be canceled when
+ * this transaction is aborted.
+ * Need to force shutdown here to make sure it
+ * happens, since this transaction may not be
+ * dirty yet.
+ */
+ mp = ntp->t_mountp;
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xfs_force_shutdown(mp,
+ (error == EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE :
+ SHUTDOWN_META_IO_ERROR);
+ return error;
+ }
+ xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ xfs_bmap_del_free(flist, NULL, free);
+ }
+ return 0;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+ xfs_bmap_free_t *flist) /* list of bmap_free_items */
+{
+ xfs_bmap_free_item_t *free; /* free list item */
+ xfs_bmap_free_item_t *next;
+
+ if (flist->xbf_count == 0)
+ return;
+ ASSERT(flist->xbf_first != NULL);
+ for (free = flist->xbf_first; free; free = next) {
+ next = free->xbfi_next;
+ xfs_bmap_del_free(flist, NULL, free);
+ }
+ ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */
+int /* error */
+xfs_bmap_first_unused(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *first_unused, /* unused block */
+ int whichfork) /* data or attr fork */
+{
+ int error; /* error return value */
+ int idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_fileoff_t lastaddr; /* last block number seen */
+ xfs_fileoff_t lowest; /* lowest useful block */
+ xfs_fileoff_t max; /* starting useful block */
+ xfs_fileoff_t off; /* offset for this block */
+ xfs_extnum_t nextents; /* number of extent entries */
+
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *first_unused = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ lowest = *first_unused;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+ off = xfs_bmbt_get_startoff(ep);
+ /*
+ * See if the hole before this extent will work.
+ */
+ if (off >= lowest + len && off - max >= len) {
+ *first_unused = max;
+ return 0;
+ }
+ lastaddr = off + xfs_bmbt_get_blockcount(ep);
+ max = XFS_FILEOFF_MAX(lastaddr, lowest);
+ }
+ *first_unused = max;
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int /* error */
+xfs_bmap_last_before(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
+{
+ xfs_fileoff_t bno; /* input file offset */
+ int eof; /* hit end of file */
+ xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
+ int error; /* error return value */
+ xfs_bmbt_irec_t got; /* current extent value */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent used */
+ xfs_bmbt_irec_t prev; /* previous extent value */
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EIO);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *last_block = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ bno = *last_block - 1;
+ ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+ &prev);
+ if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+ if (prev.br_startoff == NULLFILEOFF)
+ *last_block = 0;
+ else
+ *last_block = prev.br_startoff + prev.br_blockcount;
+ }
+ /*
+ * Otherwise *last_block is already the right answer.
+ */
+ return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int /* error */
+xfs_bmap_last_offset(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
+{
+ xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t nextents; /* number of extent entries */
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EIO);
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ *last_block = 0;
+ return 0;
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ if (!nextents) {
+ *last_block = 0;
+ return 0;
+ }
+ ep = xfs_iext_get_ext(ifp, nextents - 1);
+ *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
+ return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not. For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int /* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
+{
+ xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int rval; /* return value */
+ xfs_bmbt_irec_t s; /* internal version of extent */
+
+#ifndef DEBUG
+ if (whichfork == XFS_DATA_FORK) {
+ return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ?
+ (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
+ (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
+ }
+#endif /* !DEBUG */
+ if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ return 0;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ep = xfs_iext_get_ext(ifp, 0);
+ xfs_bmbt_get_all(ep, &s);
+ rval = s.br_startoff == 0 && s.br_blockcount == 1;
+ if (rval && whichfork == XFS_DATA_FORK)
+ ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+ return rval;
+}
+
+STATIC int
+xfs_bmap_sanity_check(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ int level)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
+ be16_to_cpu(block->bb_level) != level ||
+ be16_to_cpu(block->bb_numrecs) == 0 ||
+ be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+ return 0;
+ return 1;
+}
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int /* error */
+xfs_bmap_read_extents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
+ xfs_extnum_t i, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ /* REFERENCED */
+ xfs_extnum_t room; /* number of entries there's room for */
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+ XFS_EXTFMT_INODE(ip);
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF)))
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ if (level == 0)
+ break;
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ xfs_trans_brelse(tp, bp);
+ }
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ i = 0;
+ /*
+ * Loop over all leaf nodes. Copy information to the extent records.
+ */
+ for (;;) {
+ xfs_bmbt_rec_t *frp;
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t start;
+
+
+ num_recs = xfs_btree_get_numrecs(block);
+ if (unlikely(i + num_recs > room)) {
+ ASSERT(i + num_recs <= room);
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, (btree extents).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, block);
+ goto error0;
+ }
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, 0),
+ error0);
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ if (nextbno != NULLFSBLOCK)
+ xfs_btree_reada_bufl(mp, nextbno, 1);
+ /*
+ * Copy records into the extent records.
+ */
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ start = i;
+ for (j = 0; j < num_recs; j++, i++, frp++) {
+ xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+ trp->l0 = be64_to_cpu(frp->l0);
+ trp->l1 = be64_to_cpu(frp->l1);
+ }
+ if (exntf == XFS_EXTFMT_NOSTATE) {
+ /*
+ * Check all attribute bmap btree records and
+ * any "older" data bmap btree records for a
+ * set bit in the "extent flag" position.
+ */
+ if (unlikely(xfs_check_nostate_extents(ifp,
+ start, num_recs))) {
+ XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ goto error0;
+ }
+ }
+ xfs_trans_brelse(tp, bp);
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+ if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF)))
+ return error;
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+ XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+ return 0;
+error0:
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+}
+
+#ifdef DEBUG
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */
+void
+xfs_bmap_trace_exlist(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t cnt, /* count of entries in the list */
+ int whichfork, /* data or attr fork */
+ unsigned long caller_ip)
+{
+ xfs_extnum_t idx; /* extent record index */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int state = 0;
+
+ if (whichfork == XFS_ATTR_FORK)
+ state |= BMAP_ATTRFORK;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ for (idx = 0; idx < cnt; idx++)
+ trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the callers original parameters. Specifically check the
+ * ranges of the returned irecs to ensure that they only extent beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ int flags,
+ xfs_bmbt_irec_t *mval,
+ int nmap,
+ int ret_nmap)
+{
+ int i; /* index to map values */
+
+ ASSERT(ret_nmap <= nmap);
+
+ for (i = 0; i < ret_nmap; i++) {
+ ASSERT(mval[i].br_blockcount > 0);
+ if (!(flags & XFS_BMAPI_ENTIRE)) {
+ ASSERT(mval[i].br_startoff >= bno);
+ ASSERT(mval[i].br_blockcount <= len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+ bno + len);
+ } else {
+ ASSERT(mval[i].br_startoff < bno + len);
+ ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+ bno);
+ }
+ ASSERT(i == 0 ||
+ mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+ mval[i].br_startoff);
+ if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
+ ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+ mval[i].br_startblock != HOLESTARTBLOCK);
+ ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+ mval[i].br_state == XFS_EXT_UNWRITTEN);
+ }
+}
+#endif /* DEBUG */
+
+
+/*
+ * Map file blocks to filesystem blocks.
+ * File range is given by the bno/len pair.
+ * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
+ * into a hole or past eof.
+ * Only allocates blocks from a single allocation group,
+ * to avoid locking problems.
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int /* error */
+xfs_bmapi(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ int flags, /* XFS_BMAPI_... */
+ xfs_fsblock_t *firstblock, /* first allocated block
+ controls a.g. for allocs */
+ xfs_extlen_t total, /* total blocks needed */
+ xfs_bmbt_irec_t *mval, /* output: map values */
+ int *nmap, /* i/o: mval size/count */
+ xfs_bmap_free_t *flist) /* i/o: list extents to free */
+{
+ xfs_fsblock_t abno; /* allocated block number */
+ xfs_extlen_t alen; /* allocated extent length */
+ xfs_fileoff_t aoff; /* allocated file offset */
+ xfs_bmalloca_t bma = { 0 }; /* args for xfs_bmap_alloc */
+ xfs_btree_cur_t *cur; /* bmap btree cursor */
+ xfs_fileoff_t end; /* end of mapped file region */
+ int eof; /* we've hit the end of extents */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ int error; /* error return */
+ xfs_bmbt_irec_t got; /* current file extent record */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extlen_t indlen; /* indirect blocks length */
+ xfs_extnum_t lastx; /* last useful extent number */
+ int logflags; /* flags for transaction logging */
+ xfs_extlen_t minleft; /* min blocks left after allocation */
+ xfs_extlen_t minlen; /* min allocation size */
+ xfs_mount_t *mp; /* xfs mount structure */
+ int n; /* current extent index */
+ int nallocs; /* number of extents alloc'd */
+ xfs_extnum_t nextents; /* number of extents in file */
+ xfs_fileoff_t obno; /* old block number (offset) */
+ xfs_bmbt_irec_t prev; /* previous file extent record */
+ int tmp_logflags; /* temp flags holder */
+ int whichfork; /* data or attr fork */
+ char inhole; /* current location is hole in file */
+ char wasdelay; /* old extent was delayed */
+ char wr; /* this is a write request */
+ char rt; /* this is a realtime file */
+#ifdef DEBUG
+ xfs_fileoff_t orig_bno; /* original block number value */
+ int orig_flags; /* original flags arg value */
+ xfs_filblks_t orig_len; /* original value of len arg */
+ xfs_bmbt_irec_t *orig_mval; /* original value of mval */
+ int orig_nmap; /* original value of *nmap */
+
+ orig_bno = bno;
+ orig_len = len;
+ orig_flags = flags;
+ orig_mval = mval;
+ orig_nmap = *nmap;
+#endif
+ ASSERT(*nmap >= 1);
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
+ whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ mp = ip->i_mount;
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+ rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(ifp->if_ext_max ==
+ XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+ if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
+ XFS_STATS_INC(xs_blk_mapw);
+ else
+ XFS_STATS_INC(xs_blk_mapr);
+ /*
+ * IGSTATE flag is used to combine extents which
+ * differ only due to the state of the extents.
+ * This technique is used from xfs_getbmap()
+ * when the caller does not wish to see the
+ * separation (which is the default).
+ *
+ * This technique is also used when writing a
+ * buffer which has been partially written,
+ * (usually by being flushed during a chunkread),
+ * to ensure one write takes place. This also
+ * prevents a change in the xfs inode extents at
+ * this time, intentionally. This change occurs
+ * on completion of the write operation, in
+ * xfs_strat_comp(), where the xfs_bmapi() call
+ * is transactioned, and the extents combined.
+ */
+ if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */
+ wr = 0; /* no allocations are allowed */
+ ASSERT(wr || !(flags & XFS_BMAPI_DELAY));
+ logflags = 0;
+ nallocs = 0;
+ cur = NULL;
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ ASSERT(wr && tp);
+ if ((error = xfs_bmap_local_to_extents(tp, ip,
+ firstblock, total, &logflags, whichfork)))
+ goto error0;
+ }
+ if (wr && *firstblock == NULLFSBLOCK) {
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+ minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ else
+ minleft = 1;
+ } else
+ minleft = 0;
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ goto error0;
+ ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+ &prev);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ n = 0;
+ end = bno + len;
+ obno = bno;
+ bma.ip = NULL;
+
+ while (bno < end && n < *nmap) {
+ /*
+ * Reading past eof, act as though there's a hole
+ * up to end.
+ */
+ if (eof && !wr)
+ got.br_startoff = end;
+ inhole = eof || got.br_startoff > bno;
+ wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
+ isnullstartblock(got.br_startblock);
+ /*
+ * First, deal with the hole before the allocated space
+ * that we found, if any.
+ */
+ if (wr && (inhole || wasdelay)) {
+ /*
+ * For the wasdelay case, we could also just
+ * allocate the stuff asked for in this bmap call
+ * but that wouldn't be as good.
+ */
+ if (wasdelay) {
+ alen = (xfs_extlen_t)got.br_blockcount;
+ aoff = got.br_startoff;
+ if (lastx != NULLEXTNUM && lastx) {
+ ep = xfs_iext_get_ext(ifp, lastx - 1);
+ xfs_bmbt_get_all(ep, &prev);
+ }
+ } else {
+ alen = (xfs_extlen_t)
+ XFS_FILBLKS_MIN(len, MAXEXTLEN);
+ if (!eof)
+ alen = (xfs_extlen_t)
+ XFS_FILBLKS_MIN(alen,
+ got.br_startoff - bno);
+ aoff = bno;
+ }
+ minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1;
+ if (flags & XFS_BMAPI_DELAY) {
+ xfs_extlen_t extsz;
+
+ /* Figure out the extent size, adjust alen */
+ extsz = xfs_get_extsz_hint(ip);
+ if (extsz) {
+ /*
+ * make sure we don't exceed a single
+ * extent length when we align the
+ * extent by reducing length we are
+ * going to allocate by the maximum
+ * amount extent size aligment may
+ * require.
+ */
+ alen = XFS_FILBLKS_MIN(len,
+ MAXEXTLEN - (2 * extsz - 1));
+ error = xfs_bmap_extsize_align(mp,
+ &got, &prev, extsz,
+ rt, eof,
+ flags&XFS_BMAPI_DELAY,
+ flags&XFS_BMAPI_CONVERT,
+ &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ if (rt)
+ extsz = alen / mp->m_sb.sb_rextsize;
+
+ /*
+ * Make a transaction-less quota reservation for
+ * delayed allocation blocks. This number gets
+ * adjusted later. We return if we haven't
+ * allocated blocks already inside this loop.
+ */
+ error = xfs_trans_reserve_quota_nblks(
+ NULL, ip, (long)alen, 0,
+ rt ? XFS_QMOPT_RES_RTBLKS :
+ XFS_QMOPT_RES_REGBLKS);
+ if (error) {
+ if (n == 0) {
+ *nmap = 0;
+ ASSERT(cur == NULL);
+ return error;
+ }
+ break;
+ }
+
+ /*
+ * Split changing sb for alen and indlen since
+ * they could be coming from different places.
+ */
+ indlen = (xfs_extlen_t)
+ xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ if (rt) {
+ error = xfs_mod_incore_sb(mp,
+ XFS_SBS_FREXTENTS,
+ -((int64_t)extsz), 0);
+ } else {
+ error = xfs_icsb_modify_counters(mp,
+ XFS_SBS_FDBLOCKS,
+ -((int64_t)alen), 0);
+ }
+ if (!error) {
+ error = xfs_icsb_modify_counters(mp,
+ XFS_SBS_FDBLOCKS,
+ -((int64_t)indlen), 0);
+ if (error && rt)
+ xfs_mod_incore_sb(mp,
+ XFS_SBS_FREXTENTS,
+ (int64_t)extsz, 0);
+ else if (error)
+ xfs_icsb_modify_counters(mp,
+ XFS_SBS_FDBLOCKS,
+ (int64_t)alen, 0);
+ }
+
+ if (error) {
+ if (XFS_IS_QUOTA_ON(mp))
+ /* unreserve the blocks now */
+ (void)
+ xfs_trans_unreserve_quota_nblks(
+ NULL, ip,
+ (long)alen, 0, rt ?
+ XFS_QMOPT_RES_RTBLKS :
+ XFS_QMOPT_RES_REGBLKS);
+ break;
+ }
+
+ ip->i_delayed_blks += alen;
+ abno = nullstartblock(indlen);
+ } else {
+ /*
+ * If first time, allocate and fill in
+ * once-only bma fields.
+ */
+ if (bma.ip == NULL) {
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.prevp = &prev;
+ bma.gotp = &got;
+ bma.total = total;
+ bma.userdata = 0;
+ }
+ /* Indicate if this is the first user data
+ * in the file, or just any user data.
+ */
+ if (!(flags & XFS_BMAPI_METADATA)) {
+ bma.userdata = (aoff == 0) ?
+ XFS_ALLOC_INITIAL_USER_DATA :
+ XFS_ALLOC_USERDATA;
+ }
+ /*
+ * Fill in changeable bma fields.
+ */
+ bma.eof = eof;
+ bma.firstblock = *firstblock;
+ bma.alen = alen;
+ bma.off = aoff;
+ bma.conv = !!(flags & XFS_BMAPI_CONVERT);
+ bma.wasdel = wasdelay;
+ bma.minlen = minlen;
+ bma.low = flist->xbf_low;
+ bma.minleft = minleft;
+ /*
+ * Only want to do the alignment at the
+ * eof if it is userdata and allocation length
+ * is larger than a stripe unit.
+ */
+ if (mp->m_dalign && alen >= mp->m_dalign &&
+ (!(flags & XFS_BMAPI_METADATA)) &&
+ (whichfork == XFS_DATA_FORK)) {
+ if ((error = xfs_bmap_isaeof(ip, aoff,
+ whichfork, &bma.aeof)))
+ goto error0;
+ } else
+ bma.aeof = 0;
+ /*
+ * Call allocator.
+ */
+ if ((error = xfs_bmap_alloc(&bma)))
+ goto error0;
+ /*
+ * Copy out result fields.
+ */
+ abno = bma.rval;
+ if ((flist->xbf_low = bma.low))
+ minleft = 0;
+ alen = bma.alen;
+ aoff = bma.off;
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ XFS_FSB_TO_AGNO(mp, *firstblock) ==
+ XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
+ (flist->xbf_low &&
+ XFS_FSB_TO_AGNO(mp, *firstblock) <
+ XFS_FSB_TO_AGNO(mp, bma.firstblock)));
+ *firstblock = bma.firstblock;
+ if (cur)
+ cur->bc_private.b.firstblock =
+ *firstblock;
+ if (abno == NULLFSBLOCK)
+ break;
+ if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
+ cur = xfs_bmbt_init_cursor(mp, tp,
+ ip, whichfork);
+ cur->bc_private.b.firstblock =
+ *firstblock;
+ cur->bc_private.b.flist = flist;
+ }
+ /*
+ * Bump the number of extents we've allocated
+ * in this call.
+ */
+ nallocs++;
+ }
+ if (cur)
+ cur->bc_private.b.flags =
+ wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
+ got.br_startoff = aoff;
+ got.br_startblock = abno;
+ got.br_blockcount = alen;
+ got.br_state = XFS_EXT_NORM; /* assume normal */
+ /*
+ * Determine state of extent, and the filesystem.
+ * A wasdelay extent has been initialized, so
+ * shouldn't be flagged as unwritten.
+ */
+ if (wr && xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
+ got.br_state = XFS_EXT_UNWRITTEN;
+ }
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
+ firstblock, flist, &tmp_logflags,
+ whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ ep = xfs_iext_get_ext(ifp, lastx);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ xfs_bmbt_get_all(ep, &got);
+ ASSERT(got.br_startoff <= aoff);
+ ASSERT(got.br_startoff + got.br_blockcount >=
+ aoff + alen);
+#ifdef DEBUG
+ if (flags & XFS_BMAPI_DELAY) {
+ ASSERT(isnullstartblock(got.br_startblock));
+ ASSERT(startblockval(got.br_startblock) > 0);
+ }
+ ASSERT(got.br_state == XFS_EXT_NORM ||
+ got.br_state == XFS_EXT_UNWRITTEN);
+#endif
+ /*
+ * Fall down into the found allocated space case.
+ */
+ } else if (inhole) {
+ /*
+ * Reading in a hole.
+ */
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount =
+ XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+ mval->br_state = XFS_EXT_NORM;
+ bno += mval->br_blockcount;
+ len -= mval->br_blockcount;
+ mval++;
+ n++;
+ continue;
+ }
+ /*
+ * Then deal with the allocated space we found.
+ */
+ ASSERT(ep != NULL);
+ if (!(flags & XFS_BMAPI_ENTIRE) &&
+ (got.br_startoff + got.br_blockcount > obno)) {
+ if (obno > bno)
+ bno = obno;
+ ASSERT((bno >= obno) || (n == 0));
+ ASSERT(bno < end);
+ mval->br_startoff = bno;
+ if (isnullstartblock(got.br_startblock)) {
+ ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
+ mval->br_startblock = DELAYSTARTBLOCK;
+ } else
+ mval->br_startblock =
+ got.br_startblock +
+ (bno - got.br_startoff);
+ /*
+ * Return the minimum of what we got and what we
+ * asked for for the length. We can use the len
+ * variable here because it is modified below
+ * and we could have been there before coming
+ * here if the first part of the allocation
+ * didn't overlap what was asked for.
+ */
+ mval->br_blockcount =
+ XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
+ (bno - got.br_startoff));
+ mval->br_state = got.br_state;
+ ASSERT(mval->br_blockcount <= len);
+ } else {
+ *mval = got;
+ if (isnullstartblock(mval->br_startblock)) {
+ ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
+ mval->br_startblock = DELAYSTARTBLOCK;
+ }
+ }
+
+ /*
+ * Check if writing previously allocated but
+ * unwritten extents.
+ */
+ if (wr &&
+ ((mval->br_state == XFS_EXT_UNWRITTEN &&
+ ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) ||
+ (mval->br_state == XFS_EXT_NORM &&
+ ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT)) ==
+ (XFS_BMAPI_PREALLOC|XFS_BMAPI_CONVERT))))) {
+ /*
+ * Modify (by adding) the state flag, if writing.
+ */
+ ASSERT(mval->br_blockcount <= len);
+ if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
+ cur = xfs_bmbt_init_cursor(mp,
+ tp, ip, whichfork);
+ cur->bc_private.b.firstblock =
+ *firstblock;
+ cur->bc_private.b.flist = flist;
+ }
+ mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
+ ? XFS_EXT_NORM
+ : XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
+ firstblock, flist, &tmp_logflags,
+ whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ ep = xfs_iext_get_ext(ifp, lastx);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ xfs_bmbt_get_all(ep, &got);
+ /*
+ * We may have combined previously unwritten
+ * space with written space, so generate
+ * another request.
+ */
+ if (mval->br_blockcount < len)
+ continue;
+ }
+
+ ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+ ((mval->br_startoff + mval->br_blockcount) <= end));
+ ASSERT((flags & XFS_BMAPI_ENTIRE) ||
+ (mval->br_blockcount <= len) ||
+ (mval->br_startoff < obno));
+ bno = mval->br_startoff + mval->br_blockcount;
+ len = end - bno;
+ if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+ ASSERT(mval->br_startblock == mval[-1].br_startblock);
+ ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+ ASSERT(mval->br_state == mval[-1].br_state);
+ mval[-1].br_blockcount = mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != DELAYSTARTBLOCK &&
+ mval[-1].br_startblock != HOLESTARTBLOCK &&
+ mval->br_startblock ==
+ mval[-1].br_startblock + mval[-1].br_blockcount &&
+ ((flags & XFS_BMAPI_IGSTATE) ||
+ mval[-1].br_state == mval->br_state)) {
+ ASSERT(mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount);
+ mval[-1].br_blockcount += mval->br_blockcount;
+ } else if (n > 0 &&
+ mval->br_startblock == DELAYSTARTBLOCK &&
+ mval[-1].br_startblock == DELAYSTARTBLOCK &&
+ mval->br_startoff ==
+ mval[-1].br_startoff + mval[-1].br_blockcount) {
+ mval[-1].br_blockcount += mval->br_blockcount;
+ mval[-1].br_state = mval->br_state;
+ } else if (!((n == 0) &&
+ ((mval->br_startoff + mval->br_blockcount) <=
+ obno))) {
+ mval++;
+ n++;
+ }
+ /*
+ * If we're done, stop now. Stop when we've allocated
+ * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
+ * the transaction may get too big.
+ */
+ if (bno >= end || n >= *nmap || nallocs >= *nmap)
+ break;
+ /*
+ * Else go on to the next record.
+ */
+ prev = got;
+ if (++lastx < nextents) {
+ ep = xfs_iext_get_ext(ifp, lastx);
+ xfs_bmbt_get_all(ep, &got);
+ } else {
+ eof = 1;
+ }
+ }
+ *nmap = n;
+ /*
+ * Transform from btree to extents, give it cur.
+ */
+ if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ ASSERT(wr && cur);
+ error = xfs_bmap_btree_to_extents(tp, ip, cur,
+ &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ }
+ ASSERT(ifp->if_ext_max ==
+ XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+ error = 0;
+error0:
+ /*
+ * Log everything. Do this after conversion, there's no point in
+ * logging the extent records if we've converted to btree format.
+ */
+ if ((logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ logflags &= ~xfs_ilog_fbroot(whichfork);
+ /*
+ * Log whatever the flags say, even if error. Otherwise we might miss
+ * detecting a case where the data is changed, there's an error,
+ * and it's not logged so we don't shutdown when we should.
+ */
+ if (logflags) {
+ ASSERT(tp && wr);
+ xfs_trans_log_inode(tp, ip, logflags);
+ }
+ if (cur) {
+ if (!error) {
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ XFS_FSB_TO_AGNO(mp, *firstblock) ==
+ XFS_FSB_TO_AGNO(mp,
+ cur->bc_private.b.firstblock) ||
+ (flist->xbf_low &&
+ XFS_FSB_TO_AGNO(mp, *firstblock) <
+ XFS_FSB_TO_AGNO(mp,
+ cur->bc_private.b.firstblock)));
+ *firstblock = cur->bc_private.b.firstblock;
+ }
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ if (!error)
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return error;
+}
+
+/*
+ * Map file blocks to filesystem blocks, simple version.
+ * One block (extent) only, read-only.
+ * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
+ * For the other flag values, the effect is as if XFS_BMAPI_METADATA
+ * was set and all the others were clear.
+ */
+int /* error */
+xfs_bmapi_single(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork, /* data or attr fork */
+ xfs_fsblock_t *fsb, /* output: mapped block */
+ xfs_fileoff_t bno) /* starting file offs. mapped */
+{
+ int eof; /* we've hit the end of extents */
+ int error; /* error return */
+ xfs_bmbt_irec_t got; /* current file extent record */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last useful extent number */
+ xfs_bmbt_irec_t prev; /* previous file extent record */
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (unlikely(
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
+ XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return XFS_ERROR(EIO);
+ XFS_STATS_INC(xs_blk_mapr);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+ &prev);
+ /*
+ * Reading past eof, act as though there's a hole
+ * up to end.
+ */
+ if (eof || got.br_startoff > bno) {
+ *fsb = NULLFSBLOCK;
+ return 0;
+ }
+ ASSERT(!isnullstartblock(got.br_startblock));
+ ASSERT(bno < got.br_startoff + got.br_blockcount);
+ *fsb = got.br_startblock + (bno - got.br_startoff);
+ return 0;
+}
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value. If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int /* error */
+xfs_bunmapi(
+ xfs_trans_t *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting offset to unmap */
+ xfs_filblks_t len, /* length to unmap in file */
+ int flags, /* misc flags */
+ xfs_extnum_t nexts, /* number of extents max */
+ xfs_fsblock_t *firstblock, /* first allocated block
+ controls a.g. for allocs */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *done) /* set if not done yet */
+{
+ xfs_btree_cur_t *cur; /* bmap btree cursor */
+ xfs_bmbt_irec_t del; /* extent being deleted */
+ int eof; /* is deleting at eof */
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ int error; /* error return value */
+ xfs_extnum_t extno; /* extent number in list */
+ xfs_bmbt_irec_t got; /* current extent record */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ int isrt; /* freeing in rt area */
+ xfs_extnum_t lastx; /* last extent index used */
+ int logflags; /* transaction logging flags */
+ xfs_extlen_t mod; /* rt extent offset */
+ xfs_mount_t *mp; /* mount structure */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_bmbt_irec_t prev; /* previous extent record */
+ xfs_fileoff_t start; /* first file offset deleted */
+ int tmp_logflags; /* partial logging flags */
+ int wasdel; /* was a delayed alloc extent */
+ int whichfork; /* data or attribute fork */
+ xfs_fsblock_t sum;
+
+ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
+
+ whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (unlikely(
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+ XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ mp = ip->i_mount;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ ASSERT(len > 0);
+ ASSERT(nexts >= 0);
+ ASSERT(ifp->if_ext_max ==
+ XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(tp, ip, whichfork)))
+ return error;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *done = 1;
+ return 0;
+ }
+ XFS_STATS_INC(xs_blk_unmap);
+ isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+ start = bno;
+ bno = start + len - 1;
+ ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+ &prev);
+
+ /*
+ * Check to see if the given block number is past the end of the
+ * file, back up to the last block if so...
+ */
+ if (eof) {
+ ep = xfs_iext_get_ext(ifp, --lastx);
+ xfs_bmbt_get_all(ep, &got);
+ bno = got.br_startoff + got.br_blockcount - 1;
+ }
+ logflags = 0;
+ if (ifp->if_flags & XFS_IFBROOT) {
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = 0;
+ } else
+ cur = NULL;
+ extno = 0;
+ while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+ (nexts == 0 || extno < nexts)) {
+ /*
+ * Is the found extent after a hole in which bno lives?
+ * Just back up to the previous extent, if so.
+ */
+ if (got.br_startoff > bno) {
+ if (--lastx < 0)
+ break;
+ ep = xfs_iext_get_ext(ifp, lastx);
+ xfs_bmbt_get_all(ep, &got);
+ }
+ /*
+ * Is the last block of this extent before the range
+ * we're supposed to delete? If so, we're done.
+ */
+ bno = XFS_FILEOFF_MIN(bno,
+ got.br_startoff + got.br_blockcount - 1);
+ if (bno < start)
+ break;
+ /*
+ * Then deal with the (possibly delayed) allocated space
+ * we found.
+ */
+ ASSERT(ep != NULL);
+ del = got;
+ wasdel = isnullstartblock(del.br_startblock);
+ if (got.br_startoff < start) {
+ del.br_startoff = start;
+ del.br_blockcount -= start - got.br_startoff;
+ if (!wasdel)
+ del.br_startblock += start - got.br_startoff;
+ }
+ if (del.br_startoff + del.br_blockcount > bno + 1)
+ del.br_blockcount = bno + 1 - del.br_startoff;
+ sum = del.br_startblock + del.br_blockcount;
+ if (isrt &&
+ (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+ /*
+ * Realtime extent not lined up at the end.
+ * The extent could have been split into written
+ * and unwritten pieces, or we could just be
+ * unmapping part of it. But we can't really
+ * get rid of part of a realtime extent.
+ */
+ if (del.br_state == XFS_EXT_UNWRITTEN ||
+ !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ /*
+ * This piece is unwritten, or we're not
+ * using unwritten extents. Skip over it.
+ */
+ ASSERT(bno >= mod);
+ bno -= mod > del.br_blockcount ?
+ del.br_blockcount : mod;
+ if (bno < got.br_startoff) {
+ if (--lastx >= 0)
+ xfs_bmbt_get_all(xfs_iext_get_ext(
+ ifp, lastx), &got);
+ }
+ continue;
+ }
+ /*
+ * It's written, turn it unwritten.
+ * This is better than zeroing it.
+ */
+ ASSERT(del.br_state == XFS_EXT_NORM);
+ ASSERT(xfs_trans_get_block_res(tp) > 0);
+ /*
+ * If this spans a realtime extent boundary,
+ * chop it back to the start of the one we end at.
+ */
+ if (del.br_blockcount > mod) {
+ del.br_startoff += del.br_blockcount - mod;
+ del.br_startblock += del.br_blockcount - mod;
+ del.br_blockcount = mod;
+ }
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
+ firstblock, flist, &logflags,
+ XFS_DATA_FORK);
+ if (error)
+ goto error0;
+ goto nodelete;
+ }
+ if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
+ /*
+ * Realtime extent is lined up at the end but not
+ * at the front. We'll get rid of full extents if
+ * we can.
+ */
+ mod = mp->m_sb.sb_rextsize - mod;
+ if (del.br_blockcount > mod) {
+ del.br_blockcount -= mod;
+ del.br_startoff += mod;
+ del.br_startblock += mod;
+ } else if ((del.br_startoff == start &&
+ (del.br_state == XFS_EXT_UNWRITTEN ||
+ xfs_trans_get_block_res(tp) == 0)) ||
+ !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ /*
+ * Can't make it unwritten. There isn't
+ * a full extent here so just skip it.
+ */
+ ASSERT(bno >= del.br_blockcount);
+ bno -= del.br_blockcount;
+ if (got.br_startoff > bno) {
+ if (--lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ xfs_bmbt_get_all(ep, &got);
+ }
+ }
+ continue;
+ } else if (del.br_state == XFS_EXT_UNWRITTEN) {
+ /*
+ * This one is already unwritten.
+ * It must have a written left neighbor.
+ * Unwrite the killed part of that one and
+ * try again.
+ */
+ ASSERT(lastx > 0);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+ lastx - 1), &prev);
+ ASSERT(prev.br_state == XFS_EXT_NORM);
+ ASSERT(!isnullstartblock(prev.br_startblock));
+ ASSERT(del.br_startblock ==
+ prev.br_startblock + prev.br_blockcount);
+ if (prev.br_startoff < start) {
+ mod = start - prev.br_startoff;
+ prev.br_blockcount -= mod;
+ prev.br_startblock += mod;
+ prev.br_startoff = start;
+ }
+ prev.br_state = XFS_EXT_UNWRITTEN;
+ lastx--;
+ error = xfs_bmap_add_extent(ip, &lastx, &cur,
+ &prev, firstblock, flist, &logflags,
+ XFS_DATA_FORK);
+ if (error)
+ goto error0;
+ goto nodelete;
+ } else {
+ ASSERT(del.br_state == XFS_EXT_NORM);
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent(ip, &lastx, &cur,
+ &del, firstblock, flist, &logflags,
+ XFS_DATA_FORK);
+ if (error)
+ goto error0;
+ goto nodelete;
+ }
+ }
+ if (wasdel) {
+ ASSERT(startblockval(del.br_startblock) > 0);
+ /* Update realtime/data freespace, unreserve quota */
+ if (isrt) {
+ xfs_filblks_t rtexts;
+
+ rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
+ do_div(rtexts, mp->m_sb.sb_rextsize);
+ xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
+ (int64_t)rtexts, 0);
+ (void)xfs_trans_reserve_quota_nblks(NULL,
+ ip, -((long)del.br_blockcount), 0,
+ XFS_QMOPT_RES_RTBLKS);
+ } else {
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ (int64_t)del.br_blockcount, 0);
+ (void)xfs_trans_reserve_quota_nblks(NULL,
+ ip, -((long)del.br_blockcount), 0,
+ XFS_QMOPT_RES_REGBLKS);
+ }
+ ip->i_delayed_blks -= del.br_blockcount;
+ if (cur)
+ cur->bc_private.b.flags |=
+ XFS_BTCUR_BPRV_WASDEL;
+ } else if (cur)
+ cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
+ /*
+ * If it's the case where the directory code is running
+ * with no block reservation, and the deleted block is in
+ * the middle of its extent, and the resulting insert
+ * of an extent would cause transformation to btree format,
+ * then reject it. The calling code will then swap
+ * blocks around instead.
+ * We have to do this now, rather than waiting for the
+ * conversion to btree format, since the transaction
+ * will be dirty.
+ */
+ if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+ del.br_startoff > got.br_startoff &&
+ del.br_startoff + del.br_blockcount <
+ got.br_startoff + got.br_blockcount) {
+ error = XFS_ERROR(ENOSPC);
+ goto error0;
+ }
+ error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+ &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ bno = del.br_startoff - 1;
+nodelete:
+ /*
+ * If not done go on to the next (previous) record.
+ */
+ if (bno != (xfs_fileoff_t)-1 && bno >= start) {
+ if (lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp, lastx);
+ if (xfs_bmbt_get_startoff(ep) > bno) {
+ if (--lastx >= 0)
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ }
+ xfs_bmbt_get_all(ep, &got);
+ }
+ extno++;
+ }
+ }
+ *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+ ASSERT(ifp->if_ext_max ==
+ XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+ /*
+ * Convert to a btree if necessary.
+ */
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+ &cur, 0, &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ }
+ /*
+ * transform from btree to extents, give it cur
+ */
+ else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ ASSERT(cur != NULL);
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ whichfork);
+ logflags |= tmp_logflags;
+ if (error)
+ goto error0;
+ }
+ /*
+ * transform from extents to local?
+ */
+ ASSERT(ifp->if_ext_max ==
+ XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+ error = 0;
+error0:
+ /*
+ * Log everything. Do this after conversion, there's no point in
+ * logging the extent records if we've converted to btree format.
+ */
+ if ((logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ logflags &= ~xfs_ilog_fbroot(whichfork);
+ /*
+ * Log inode even in the error case, if the transaction
+ * is dirty we'll need to shut down the filesystem.
+ */
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur) {
+ if (!error) {
+ *firstblock = cur->bc_private.b.firstblock;
+ cur->bc_private.b.allocated = 0;
+ }
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ return error;
+}
+
+/*
+ * returns 1 for success, 0 if we failed to map the extent.
+ */
+STATIC int
+xfs_getbmapx_fix_eof_hole(
+ xfs_inode_t *ip, /* xfs incore inode pointer */
+ struct getbmapx *out, /* output structure */
+ int prealloced, /* this is a file with
+ * preallocated data space */
+ __int64_t end, /* last block requested */
+ xfs_fsblock_t startblock)
+{
+ __int64_t fixlen;
+ xfs_mount_t *mp; /* file system mount point */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent pointer */
+ xfs_fileoff_t fileblock;
+
+ if (startblock == HOLESTARTBLOCK) {
+ mp = ip->i_mount;
+ out->bmv_block = -1;
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+ fixlen -= out->bmv_offset;
+ if (prealloced && out->bmv_offset + out->bmv_length == end) {
+ /* Came to hole at EOF. Trim it. */
+ if (fixlen <= 0)
+ return 0;
+ out->bmv_length = fixlen;
+ }
+ } else {
+ if (startblock == DELAYSTARTBLOCK)
+ out->bmv_block = -2;
+ else
+ out->bmv_block = xfs_fsb_to_db(ip, startblock);
+ fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+ (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+ out->bmv_oflags |= BMV_OF_LAST;
+ }
+
+ return 1;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ */
+int /* error code */
+xfs_getbmap(
+ xfs_inode_t *ip,
+ struct getbmapx *bmv, /* user bmap structure */
+ xfs_bmap_format_t formatter, /* format to user */
+ void *arg) /* formatter arg */
+{
+ __int64_t bmvend; /* last block requested */
+ int error = 0; /* return value */
+ __int64_t fixlen; /* length for -1 case */
+ int i; /* extent number */
+ int lock; /* lock state */
+ xfs_bmbt_irec_t *map; /* buffer for user's data */
+ xfs_mount_t *mp; /* file system mount point */
+ int nex; /* # of user extents can do */
+ int nexleft; /* # of user extents left */
+ int subnex; /* # of bmapi's can do */
+ int nmap; /* number of map entries */
+ struct getbmapx *out; /* output structure */
+ int whichfork; /* data or attr fork */
+ int prealloced; /* this is a file with
+ * preallocated data space */
+ int iflags; /* interface flags */
+ int bmapi_flags; /* flags for xfs_bmapi */
+ int cur_ext = 0;
+
+ mp = ip->i_mount;
+ iflags = bmv->bmv_iflags;
+ whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
+
+ if (whichfork == XFS_ATTR_FORK) {
+ if (XFS_IFORK_Q(ip)) {
+ if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EINVAL);
+ } else if (unlikely(
+ ip->i_d.di_aformat != 0 &&
+ ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
+ XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ prealloced = 0;
+ fixlen = 1LL << 32;
+ } else {
+ if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+ return XFS_ERROR(EINVAL);
+
+ if (xfs_get_extsz_hint(ip) ||
+ ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
+ prealloced = 1;
+ fixlen = XFS_MAXIOFFSET(mp);
+ } else {
+ prealloced = 0;
+ fixlen = ip->i_size;
+ }
+ }
+
+ if (bmv->bmv_length == -1) {
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+ bmv->bmv_length =
+ max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+ } else if (bmv->bmv_length == 0) {
+ bmv->bmv_entries = 0;
+ return 0;
+ } else if (bmv->bmv_length < 0) {
+ return XFS_ERROR(EINVAL);
+ }
+
+ nex = bmv->bmv_count - 1;
+ if (nex <= 0)
+ return XFS_ERROR(EINVAL);
+ bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+
+ if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+ return XFS_ERROR(ENOMEM);
+ out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
+ if (!out)
+ return XFS_ERROR(ENOMEM);
+
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
+ if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+ error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+ if (error)
+ goto out_unlock_iolock;
+ }
+ /*
+ * even after flushing the inode, there can still be delalloc
+ * blocks on the inode beyond EOF due to speculative
+ * preallocation. These are not removed until the release
+ * function is called or the inode is inactivated. Hence we
+ * cannot assert here that ip->i_delayed_blks == 0.
+ */
+ }
+
+ lock = xfs_ilock_map_shared(ip);
+
+ /*
+ * Don't let nex be bigger than the number of extents
+ * we can have assuming alternating holes and real extents.
+ */
+ if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+ nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+ bmapi_flags = xfs_bmapi_aflag(whichfork);
+ if (!(iflags & BMV_IF_PREALLOC))
+ bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+ /*
+ * Allocate enough space to handle "subnex" maps at a time.
+ */
+ error = ENOMEM;
+ subnex = 16;
+ map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
+ if (!map)
+ goto out_unlock_ilock;
+
+ bmv->bmv_entries = 0;
+
+ if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+ (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+ error = 0;
+ goto out_free_map;
+ }
+
+ nexleft = nex;
+
+ do {
+ nmap = (nexleft > subnex) ? subnex : nexleft;
+ error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+ XFS_BB_TO_FSB(mp, bmv->bmv_length),
+ bmapi_flags, NULL, 0, map, &nmap,
+ NULL);
+ if (error)
+ goto out_free_map;
+ ASSERT(nmap <= subnex);
+
+ for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+ out[cur_ext].bmv_oflags = 0;
+ if (map[i].br_state == XFS_EXT_UNWRITTEN)
+ out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+ else if (map[i].br_startblock == DELAYSTARTBLOCK)
+ out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+ out[cur_ext].bmv_offset =
+ XFS_FSB_TO_BB(mp, map[i].br_startoff);
+ out[cur_ext].bmv_length =
+ XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+ out[cur_ext].bmv_unused1 = 0;
+ out[cur_ext].bmv_unused2 = 0;
+ ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
+ (map[i].br_startblock != DELAYSTARTBLOCK));
+ if (map[i].br_startblock == HOLESTARTBLOCK &&
+ whichfork == XFS_ATTR_FORK) {
+ /* came to the end of attribute fork */
+ out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+ goto out_free_map;
+ }
+
+ if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+ prealloced, bmvend,
+ map[i].br_startblock))
+ goto out_free_map;
+
+ bmv->bmv_offset =
+ out[cur_ext].bmv_offset +
+ out[cur_ext].bmv_length;
+ bmv->bmv_length =
+ max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+ /*
+ * In case we don't want to return the hole,
+ * don't increase cur_ext so that we can reuse
+ * it in the next loop.
+ */
+ if ((iflags & BMV_IF_NO_HOLES) &&
+ map[i].br_startblock == HOLESTARTBLOCK) {
+ memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+ continue;
+ }
+
+ nexleft--;
+ bmv->bmv_entries++;
+ cur_ext++;
+ }
+ } while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+ kmem_free(map);
+ out_unlock_ilock:
+ xfs_iunlock_map_shared(ip, lock);
+ out_unlock_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ for (i = 0; i < cur_ext; i++) {
+ int full = 0; /* user array is full */
+
+ /* format results & advance arg */
+ error = formatter(&arg, &out[i], &full);
+ if (error || full)
+ break;
+ }
+
+ kmem_free(out);
+ return error;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ */
+STATIC int /* error */
+xfs_bmap_isaeof(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t off, /* file offset in fsblocks */
+ int whichfork, /* data or attribute fork */
+ char *aeof) /* return value */
+{
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_bmbt_irec_t s; /* expanded extent record */
+
+ ASSERT(whichfork == XFS_DATA_FORK);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(NULL, ip, whichfork)))
+ return error;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *aeof = 1;
+ return 0;
+ }
+ /*
+ * Go to the last extent
+ */
+ lastrec = xfs_iext_get_ext(ifp, nextents - 1);
+ xfs_bmbt_get_all(lastrec, &s);
+ /*
+ * Check we are allocating in the last extent (for delayed allocations)
+ * or past the last extent for non-delayed allocations.
+ */
+ *aeof = (off >= s.br_startoff &&
+ off < s.br_startoff + s.br_blockcount &&
+ isnullstartblock(s.br_startblock)) ||
+ off >= s.br_startoff + s.br_blockcount;
+ return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.
+ */
+int /* error */
+xfs_bmap_eof(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_fileoff_t endoff, /* file offset in fsblocks */
+ int whichfork, /* data or attribute fork */
+ int *eof) /* result value */
+{
+ xfs_fsblock_t blockcount; /* extent block count */
+ int error; /* error return value */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_bmbt_rec_host_t *lastrec; /* extent record pointer */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_fileoff_t startoff; /* extent starting file offset */
+
+ ASSERT(whichfork == XFS_DATA_FORK);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+ (error = xfs_iread_extents(NULL, ip, whichfork)))
+ return error;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *eof = 1;
+ return 0;
+ }
+ /*
+ * Go to the last extent
+ */
+ lastrec = xfs_iext_get_ext(ifp, nextents - 1);
+ startoff = xfs_bmbt_get_startoff(lastrec);
+ blockcount = xfs_bmbt_get_blockcount(lastrec);
+ *eof = endoff >= startoff + blockcount;
+ return 0;
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+ struct xfs_btree_cur *cur,
+ xfs_fsblock_t bno)
+{
+ struct xfs_log_item_desc *lidp;
+ int i;
+
+ if (!cur)
+ return NULL;
+
+ for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+ if (!cur->bc_bufs[i])
+ break;
+ if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+ return cur->bc_bufs[i];
+ }
+
+ /* Chase down all the log items to see if the bp is there */
+ list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+ struct xfs_buf_log_item *bip;
+ bip = (struct xfs_buf_log_item *)lidp->lid_item;
+ if (bip->bli_item.li_type == XFS_LI_BUF &&
+ XFS_BUF_ADDR(bip->bli_buf) == bno)
+ return bip->bli_buf;
+ }
+
+ return NULL;
+}
+
+STATIC void
+xfs_check_block(
+ struct xfs_btree_block *block,
+ xfs_mount_t *mp,
+ int root,
+ short sz)
+{
+ int i, j, dmxr;
+ __be64 *pp, *thispa; /* pointer to block address */
+ xfs_bmbt_key_t *prevp, *keyp;
+
+ ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+ prevp = NULL;
+ for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+ dmxr = mp->m_bmap_dmxr[0];
+ keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+ if (prevp) {
+ ASSERT(be64_to_cpu(prevp->br_startoff) <
+ be64_to_cpu(keyp->br_startoff));
+ }
+ prevp = keyp;
+
+ /*
+ * Compare the block numbers to see if there are dups.
+ */
+ if (root)
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+ else
+ pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+ for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+ if (root)
+ thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+ else
+ thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+ if (*thispa == *pp) {
+ xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+ __func__, j, i,
+ (unsigned long long)be64_to_cpu(*thispa));
+ panic("%s: ptrs are equal in node\n",
+ __func__);
+ }
+ }
+ }
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+ xfs_btree_cur_t *cur, /* btree cursor or null */
+ xfs_inode_t *ip, /* incore inode pointer */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_buf_t *bp; /* buffer for "block" */
+ int error; /* error return value */
+ xfs_extnum_t i=0, j; /* index into the extents list */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+ xfs_bmbt_rec_t *ep; /* pointer to current extent */
+ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
+ xfs_bmbt_rec_t *nextp; /* pointer to next extent */
+ int bp_release = 0;
+
+ if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+ return;
+ }
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ block = ifp->if_broot;
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+ */
+ while (level-- > 0) {
+ /* See if buf is in cur first */
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (bp) {
+ bp_release = 0;
+ } else {
+ bp_release = 1;
+ }
+ if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF)))
+ goto error_norelse;
+ block = XFS_BUF_TO_BLOCK(bp);
+ XFS_WANT_CORRUPTED_GOTO(
+ xfs_bmap_sanity_check(mp, bp, level),
+ error0);
+ if (level == 0)
+ break;
+
+ /*
+ * Check this block for basic sanity (increasing keys and
+ * no duplicate blocks).
+ */
+
+ xfs_check_block(block, mp, 0, 0);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ }
+
+ /*
+ * Here with bp and block set to the leftmost leaf node in the tree.
+ */
+ i = 0;
+
+ /*
+ * Loop over all leaf nodes checking that all extents are in the right order.
+ */
+ for (;;) {
+ xfs_fsblock_t nextbno;
+ xfs_extnum_t num_recs;
+
+
+ num_recs = xfs_btree_get_numrecs(block);
+
+ /*
+ * Read-ahead the next leaf block, if any.
+ */
+
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ /*
+ * Check all the extents to make sure they are OK.
+ * If we had a previous block, the last entry should
+ * conform with the first entry in this one.
+ */
+
+ ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+ if (i) {
+ ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+ xfs_bmbt_disk_get_blockcount(&last) <=
+ xfs_bmbt_disk_get_startoff(ep));
+ }
+ for (j = 1; j < num_recs; j++) {
+ nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+ xfs_bmbt_disk_get_blockcount(ep) <=
+ xfs_bmbt_disk_get_startoff(nextp));
+ ep = nextp;
+ }
+
+ last = *ep;
+ i += num_recs;
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ bno = nextbno;
+ /*
+ * If we've reached the end, stop.
+ */
+ if (bno == NULLFSBLOCK)
+ break;
+
+ bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+ if (bp) {
+ bp_release = 0;
+ } else {
+ bp_release = 1;
+ }
+ if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF)))
+ goto error_norelse;
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ if (bp_release) {
+ bp_release = 0;
+ xfs_trans_brelse(NULL, bp);
+ }
+ return;
+
+error0:
+ xfs_warn(mp, "%s: at error0", __func__);
+ if (bp_release)
+ xfs_trans_brelse(NULL, bp);
+error_norelse:
+ xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ __func__, i);
+ panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ return;
+}
+#endif
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int /* error */
+xfs_bmap_count_blocks(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_inode_t *ip, /* incore inode */
+ int whichfork, /* data or attr fork */
+ int *count) /* out: count of blocks */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ xfs_fsblock_t bno; /* block # of "block" */
+ xfs_ifork_t *ifp; /* fork structure */
+ int level; /* btree level, for checking */
+ xfs_mount_t *mp; /* file system mount structure */
+ __be64 *pp; /* pointer to block address */
+
+ bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+ xfs_bmap_count_leaves(ifp, 0,
+ ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+ count);
+ return 0;
+ }
+
+ /*
+ * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ */
+ block = ifp->if_broot;
+ level = be16_to_cpu(block->bb_level);
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+ ASSERT(bno != NULLDFSBNO);
+ ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+ ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+ if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
+ XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+ mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ return 0;
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks is use.
+ */
+STATIC int /* error */
+xfs_bmap_count_tree(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fsblock_t blockno, /* file system block number */
+ int levelin, /* level in btree */
+ int *count) /* Count of blocks */
+{
+ int error;
+ xfs_buf_t *bp, *nbp;
+ int level = levelin;
+ __be64 *pp;
+ xfs_fsblock_t bno = blockno;
+ xfs_fsblock_t nextbno;
+ struct xfs_btree_block *block, *nextblock;
+ int numrecs;
+
+ if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+ return error;
+ *count += 1;
+ block = XFS_BUF_TO_BLOCK(bp);
+
+ if (--level) {
+ /* Not at node above leaves, count this level of nodes */
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ while (nextbno != NULLFSBLOCK) {
+ if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
+ 0, &nbp, XFS_BMAP_BTREE_REF)))
+ return error;
+ *count += 1;
+ nextblock = XFS_BUF_TO_BLOCK(nbp);
+ nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+ xfs_trans_brelse(tp, nbp);
+ }
+
+ /* Dive to the next level */
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ bno = be64_to_cpu(*pp);
+ if (unlikely((error =
+ xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+ xfs_trans_brelse(tp, bp);
+ XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ xfs_trans_brelse(tp, bp);
+ } else {
+ /* count all level 1 nodes and their leaves */
+ for (;;) {
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+ numrecs = be16_to_cpu(block->bb_numrecs);
+ xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+ xfs_trans_brelse(tp, bp);
+ if (nextbno == NULLFSBLOCK)
+ break;
+ bno = nextbno;
+ if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF)))
+ return error;
+ *count += 1;
+ block = XFS_BUF_TO_BLOCK(bp);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+ xfs_ifork_t *ifp,
+ xfs_extnum_t idx,
+ int numrecs,
+ int *count)
+{
+ int b;
+
+ for (b = 0; b < numrecs; b++) {
+ xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
+ *count += xfs_bmbt_get_blockcount(frp);
+ }
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ int numrecs,
+ int *count)
+{
+ int b;
+ xfs_bmbt_rec_t *frp;
+
+ for (b = 1; b <= numrecs; b++) {
+ frp = XFS_BMBT_REC_ADDR(mp, block, b);
+ *count += xfs_bmbt_disk_get_blockcount(frp);
+ }
+}
+
+/*
+ * dead simple method of punching delalyed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will alays punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length)
+{
+ xfs_fileoff_t remaining = length;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ do {
+ int done;
+ xfs_bmbt_irec_t imap;
+ int nimaps = 1;
+ xfs_fsblock_t firstblock;
+ xfs_bmap_free_t flist;
+
+ /*
+ * Map the range first and check that it is a delalloc extent
+ * before trying to unmap the range. Otherwise we will be
+ * trying to remove a real extent (which requires a
+ * transaction) or a hole, which is probably a bad idea...
+ */
+ error = xfs_bmapi(NULL, ip, start_fsb, 1,
+ XFS_BMAPI_ENTIRE, NULL, 0, &imap,
+ &nimaps, NULL);
+
+ if (error) {
+ /* something screwed, just bail */
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ xfs_alert(ip->i_mount,
+ "Failed delalloc mapping lookup ino %lld fsb %lld.",
+ ip->i_ino, start_fsb);
+ }
+ break;
+ }
+ if (!nimaps) {
+ /* nothing there */
+ goto next_block;
+ }
+ if (imap.br_startblock != DELAYSTARTBLOCK) {
+ /* been converted, ignore */
+ goto next_block;
+ }
+ WARN_ON(imap.br_blockcount == 0);
+
+ /*
+ * Note: while we initialise the firstblock/flist pair, they
+ * should never be used because blocks should never be
+ * allocated or freed for a delalloc extent and hence we need
+ * don't cancel or finish them after the xfs_bunmapi() call.
+ */
+ xfs_bmap_init(&flist, &firstblock);
+ error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+ &flist, &done);
+ if (error)
+ break;
+
+ ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+ start_fsb++;
+ remaining--;
+ } while(remaining > 0);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
new file mode 100644
index 00000000..c62234bd
--- /dev/null
+++ b/fs/xfs/xfs_bmap.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BMAP_H__
+#define __XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern kmem_zone_t *xfs_bmap_free_item_zone;
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+typedef struct xfs_bmap_free_item
+{
+ xfs_fsblock_t xbfi_startblock;/* starting fs block number */
+ xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
+ struct xfs_bmap_free_item *xbfi_next; /* link to next entry */
+} xfs_bmap_free_item_t;
+
+/*
+ * Header for free extent list.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent. In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs. In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0. If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+typedef struct xfs_bmap_free
+{
+ xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
+ int xbf_count; /* count of items on list */
+ int xbf_low; /* alloc in low mode */
+} xfs_bmap_free_t;
+
+#define XFS_BMAP_MAX_NMAP 4
+
+/*
+ * Flags for xfs_bmapi
+ */
+#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */
+#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
+#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
+ /* combine contig. space */
+#define XFS_BMAPI_CONTIG 0x100 /* must allocate only one extent */
+/*
+ * unwritten extent conversion - this needs write cache flushing and no additional
+ * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
+ * from written to unwritten, otherwise convert from unwritten to written.
+ */
+#define XFS_BMAPI_CONVERT 0x200
+
+#define XFS_BMAPI_FLAGS \
+ { XFS_BMAPI_WRITE, "WRITE" }, \
+ { XFS_BMAPI_DELAY, "DELAY" }, \
+ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
+ { XFS_BMAPI_METADATA, "METADATA" }, \
+ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
+ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
+ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
+ { XFS_BMAPI_CONTIG, "CONTIG" }, \
+ { XFS_BMAPI_CONVERT, "CONVERT" }
+
+
+static inline int xfs_bmapi_aflag(int w)
+{
+ return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
+}
+
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ */
+#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
+#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
+
+static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
+{
+ ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
+ (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+}
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+typedef struct xfs_bmalloca {
+ xfs_fsblock_t firstblock; /* i/o first block allocated */
+ xfs_fsblock_t rval; /* starting block of new extent */
+ xfs_fileoff_t off; /* offset in file filling in */
+ struct xfs_trans *tp; /* transaction pointer */
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct xfs_bmbt_irec *prevp; /* extent before the new one */
+ struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
+ xfs_extlen_t alen; /* i/o length asked/allocated */
+ xfs_extlen_t total; /* total blocks needed for xaction */
+ xfs_extlen_t minlen; /* minimum allocation size (blocks) */
+ xfs_extlen_t minleft; /* amount must be left after alloc */
+ char eof; /* set if allocating past last extent */
+ char wasdel; /* replacing a delayed allocation */
+ char userdata;/* set if is user data */
+ char low; /* low on space, using seq'l ags */
+ char aeof; /* allocated space at eof */
+ char conv; /* overwriting unwritten extents */
+} xfs_bmalloca_t;
+
+/*
+ * Flags for xfs_bmap_add_extent*.
+ */
+#define BMAP_LEFT_CONTIG (1 << 0)
+#define BMAP_RIGHT_CONTIG (1 << 1)
+#define BMAP_LEFT_FILLING (1 << 2)
+#define BMAP_RIGHT_FILLING (1 << 3)
+#define BMAP_LEFT_DELAY (1 << 4)
+#define BMAP_RIGHT_DELAY (1 << 5)
+#define BMAP_LEFT_VALID (1 << 6)
+#define BMAP_RIGHT_VALID (1 << 7)
+#define BMAP_ATTRFORK (1 << 8)
+
+#define XFS_BMAP_EXT_FLAGS \
+ { BMAP_LEFT_CONTIG, "LC" }, \
+ { BMAP_RIGHT_CONTIG, "RC" }, \
+ { BMAP_LEFT_FILLING, "LF" }, \
+ { BMAP_RIGHT_FILLING, "RF" }, \
+ { BMAP_ATTRFORK, "ATTR" }
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent list.
+ *
+ * Quite excessive tracing. Only do this for debug builds.
+ */
+#if defined(__KERNEL) && defined(DEBUG)
+void
+xfs_bmap_trace_exlist(
+ struct xfs_inode *ip, /* incore inode pointer */
+ xfs_extnum_t cnt, /* count of entries in list */
+ int whichfork,
+ unsigned long caller_ip); /* data or attr fork */
+#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
+ xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
+#else
+#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
+#endif
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int /* error code */
+xfs_bmap_add_attrfork(
+ struct xfs_inode *ip, /* incore inode pointer */
+ int size, /* space needed for new attribute */
+ int rsvd); /* flag for reserved block allocation */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+ xfs_fsblock_t bno, /* fs block number of extent */
+ xfs_filblks_t len, /* length of extent */
+ xfs_bmap_free_t *flist, /* list of extents */
+ struct xfs_mount *mp); /* mount point structure */
+
+/*
+ * Routine to clean up the free list data structure when
+ * an error occurs during a transaction.
+ */
+void
+xfs_bmap_cancel(
+ xfs_bmap_free_t *flist); /* free list to clean up */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+ struct xfs_mount *mp, /* file system mount structure */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Returns the file-relative block number of the first unused block in the file.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ */
+int /* error */
+xfs_bmap_first_unused(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_extlen_t len, /* size of hole to find */
+ xfs_fileoff_t *unused, /* unused block num */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int /* error */
+xfs_bmap_last_before(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file. This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int /* error */
+xfs_bmap_last_offset(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t *unused, /* last block num */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not. For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int
+xfs_bmap_one_block(
+ struct xfs_inode *ip, /* incore inode */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Read in the extents to iu_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in.
+ */
+int /* error */
+xfs_bmap_read_extents(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ int whichfork); /* data or attr fork */
+
+/*
+ * Map file blocks to filesystem blocks.
+ * File range is given by the bno/len pair.
+ * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
+ * into a hole or past eof.
+ * Only allocates blocks from a single allocation group,
+ * to avoid locking problems.
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int /* error */
+xfs_bmapi(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting file offs. mapped */
+ xfs_filblks_t len, /* length to map in file */
+ int flags, /* XFS_BMAPI_... */
+ xfs_fsblock_t *firstblock, /* first allocated block
+ controls a.g. for allocs */
+ xfs_extlen_t total, /* total blocks needed */
+ struct xfs_bmbt_irec *mval, /* output: map values */
+ int *nmap, /* i/o: mval size/count */
+ xfs_bmap_free_t *flist); /* i/o: list extents to free */
+
+/*
+ * Map file blocks to filesystem blocks, simple version.
+ * One block only, read-only.
+ * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
+ * For the other flag values, the effect is as if XFS_BMAPI_METADATA
+ * was set and all the others were clear.
+ */
+int /* error */
+xfs_bmapi_single(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ int whichfork, /* data or attr fork */
+ xfs_fsblock_t *fsb, /* output: mapped block */
+ xfs_fileoff_t bno); /* starting file offs. mapped */
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value. If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int /* error */
+xfs_bunmapi(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t bno, /* starting offset to unmap */
+ xfs_filblks_t len, /* length to unmap in file */
+ int flags, /* XFS_BMAPI_... */
+ xfs_extnum_t nexts, /* number of extents max */
+ xfs_fsblock_t *firstblock, /* first allocated block
+ controls a.g. for allocs */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *done); /* set if not done yet */
+
+/*
+ * Check an extent list, which has just been read, for
+ * any bit in the extent flag field.
+ */
+int
+xfs_check_nostate_extents(
+ struct xfs_ifork *ifp,
+ xfs_extnum_t idx,
+ xfs_extnum_t num);
+
+uint
+xfs_default_attroffset(
+ struct xfs_inode *ip);
+
+#ifdef __KERNEL__
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller. Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.
+ *
+ * Return 1 if the given transaction was committed and a new one allocated,
+ * and 0 otherwise.
+ */
+int /* error */
+xfs_bmap_finish(
+ struct xfs_trans **tp, /* transaction pointer addr */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *committed); /* xact committed or not */
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ */
+int /* error code */
+xfs_getbmap(
+ xfs_inode_t *ip,
+ struct getbmapx *bmv, /* user bmap structure */
+ xfs_bmap_format_t formatter, /* format to user */
+ void *arg); /* formatter arg */
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary
+ */
+int
+xfs_bmap_eof(
+ struct xfs_inode *ip,
+ xfs_fileoff_t endoff,
+ int whichfork,
+ int *eof);
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int
+xfs_bmap_count_blocks(
+ xfs_trans_t *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ int *count);
+
+int
+xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t length);
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
new file mode 100644
index 00000000..87d3c10b
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -0,0 +1,919 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+
+/*
+ * Determine the extent state.
+ */
+/* ARGSUSED */
+STATIC xfs_exntst_t
+xfs_extent_state(
+ xfs_filblks_t blks,
+ int extent_flag)
+{
+ if (extent_flag) {
+ ASSERT(blks != 0); /* saved for DMIG */
+ return XFS_EXT_UNWRITTEN;
+ }
+ return XFS_EXT_NORM;
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+ struct xfs_mount *mp,
+ xfs_bmdr_block_t *dblock,
+ int dblocklen,
+ struct xfs_btree_block *rblock,
+ int rblocklen)
+{
+ int dmxr;
+ xfs_bmbt_key_t *fkp;
+ __be64 *fpp;
+ xfs_bmbt_key_t *tkp;
+ __be64 *tpp;
+
+ rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
+ rblock->bb_level = dblock->bb_level;
+ ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+ rblock->bb_numrecs = dblock->bb_numrecs;
+ rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ dmxr = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ */
+STATIC void
+__xfs_bmbt_get_all(
+ __uint64_t l0,
+ __uint64_t l1,
+ xfs_bmbt_irec_t *s)
+{
+ int ext_flag;
+ xfs_exntst_t st;
+
+ ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
+ s->br_startoff = ((xfs_fileoff_t)l0 &
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+#if XFS_BIG_BLKNOS
+ s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
+ (((xfs_fsblock_t)l1) >> 21);
+#else
+#ifdef DEBUG
+ {
+ xfs_dfsbno_t b;
+
+ b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
+ (((xfs_dfsbno_t)l1) >> 21);
+ ASSERT((b >> 32) == 0 || isnulldstartblock(b));
+ s->br_startblock = (xfs_fsblock_t)b;
+ }
+#else /* !DEBUG */
+ s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
+#endif /* DEBUG */
+#endif /* XFS_BIG_BLKNOS */
+ s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
+ /* This is xfs_extent_state() in-line */
+ if (ext_flag) {
+ ASSERT(s->br_blockcount != 0); /* saved for DMIG */
+ st = XFS_EXT_UNWRITTEN;
+ } else
+ st = XFS_EXT_NORM;
+ s->br_state = st;
+}
+
+void
+xfs_bmbt_get_all(
+ xfs_bmbt_rec_host_t *r,
+ xfs_bmbt_irec_t *s)
+{
+ __xfs_bmbt_get_all(r->l0, r->l1, s);
+}
+
+/*
+ * Extract the blockcount field from an in memory bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+ xfs_bmbt_rec_host_t *r)
+{
+ return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startblock field from an in memory bmap extent record.
+ */
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+ xfs_bmbt_rec_host_t *r)
+{
+#if XFS_BIG_BLKNOS
+ return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
+ (((xfs_fsblock_t)r->l1) >> 21);
+#else
+#ifdef DEBUG
+ xfs_dfsbno_t b;
+
+ b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
+ (((xfs_dfsbno_t)r->l1) >> 21);
+ ASSERT((b >> 32) == 0 || isnulldstartblock(b));
+ return (xfs_fsblock_t)b;
+#else /* !DEBUG */
+ return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
+#endif /* DEBUG */
+#endif /* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Extract the startoff field from an in memory bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+ xfs_bmbt_rec_host_t *r)
+{
+ return ((xfs_fileoff_t)r->l0 &
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+ xfs_bmbt_rec_host_t *r)
+{
+ int ext_flag;
+
+ ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
+ return xfs_extent_state(xfs_bmbt_get_blockcount(r),
+ ext_flag);
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+ xfs_bmbt_rec_t *r)
+{
+ return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+ xfs_bmbt_rec_t *r)
+{
+ return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+ xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+
+/*
+ * Set all the fields in a bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_set_allf(
+ xfs_bmbt_rec_host_t *r,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state)
+{
+ int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+ ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+ ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+ ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+
+#if XFS_BIG_BLKNOS
+ ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+ r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ ((xfs_bmbt_rec_base_t)startblock >> 43);
+ r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+#else /* !XFS_BIG_BLKNOS */
+ if (isnullstartblock(startblock)) {
+ r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+ r->l1 = xfs_mask64hi(11) |
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+ } else {
+ r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9);
+ r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+ }
+#endif /* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_set_all(
+ xfs_bmbt_rec_host_t *r,
+ xfs_bmbt_irec_t *s)
+{
+ xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock,
+ s->br_blockcount, s->br_state);
+}
+
+
+/*
+ * Set all the fields in a disk format bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_disk_set_allf(
+ xfs_bmbt_rec_t *r,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state)
+{
+ int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
+
+ ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
+ ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+ ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+
+#if XFS_BIG_BLKNOS
+ ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+
+ r->l0 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ ((xfs_bmbt_rec_base_t)startblock >> 43));
+ r->l1 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+#else /* !XFS_BIG_BLKNOS */
+ if (isnullstartblock(startblock)) {
+ r->l0 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9) |
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+ r->l1 = cpu_to_be64(xfs_mask64hi(11) |
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+ } else {
+ r->l0 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+ ((xfs_bmbt_rec_base_t)startoff << 9));
+ r->l1 = cpu_to_be64(
+ ((xfs_bmbt_rec_base_t)startblock << 21) |
+ ((xfs_bmbt_rec_base_t)blockcount &
+ (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
+ }
+#endif /* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+STATIC void
+xfs_bmbt_disk_set_all(
+ xfs_bmbt_rec_t *r,
+ xfs_bmbt_irec_t *s)
+{
+ xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock,
+ s->br_blockcount, s->br_state);
+}
+
+/*
+ * Set the blockcount field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_blockcount(
+ xfs_bmbt_rec_host_t *r,
+ xfs_filblks_t v)
+{
+ ASSERT((v & xfs_mask64hi(43)) == 0);
+ r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
+ (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
+}
+
+/*
+ * Set the startblock field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startblock(
+ xfs_bmbt_rec_host_t *r,
+ xfs_fsblock_t v)
+{
+#if XFS_BIG_BLKNOS
+ ASSERT((v & xfs_mask64hi(12)) == 0);
+ r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
+ (xfs_bmbt_rec_base_t)(v >> 43);
+ r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
+ (xfs_bmbt_rec_base_t)(v << 21);
+#else /* !XFS_BIG_BLKNOS */
+ if (isnullstartblock(v)) {
+ r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+ r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
+ ((xfs_bmbt_rec_base_t)v << 21) |
+ (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+ } else {
+ r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+ r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
+ (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
+ }
+#endif /* XFS_BIG_BLKNOS */
+}
+
+/*
+ * Set the startoff field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startoff(
+ xfs_bmbt_rec_host_t *r,
+ xfs_fileoff_t v)
+{
+ ASSERT((v & xfs_mask64hi(9)) == 0);
+ r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
+ ((xfs_bmbt_rec_base_t)v << 9) |
+ (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+}
+
+/*
+ * Set the extent state field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_state(
+ xfs_bmbt_rec_host_t *r,
+ xfs_exntst_t v)
+{
+ ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
+ if (v == XFS_EXT_NORM)
+ r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
+ else
+ r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
+ int rblocklen,
+ xfs_bmdr_block_t *dblock,
+ int dblocklen)
+{
+ int dmxr;
+ xfs_bmbt_key_t *fkp;
+ __be64 *fpp;
+ xfs_bmbt_key_t *tkp;
+ __be64 *tpp;
+
+ ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
+ ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
+ ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
+ ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+ dblock->bb_level = rblock->bb_level;
+ dblock->bb_numrecs = rblock->bb_numrecs;
+ dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ dmxr = be16_to_cpu(dblock->bb_numrecs);
+ memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
+ memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
+}
+
+/*
+ * Check extent records, which have just been read, for
+ * any bit in the extent flag field. ASSERT on debug
+ * kernels, as this condition should not occur.
+ * Return an error condition (1) if any flags found,
+ * otherwise return 0.
+ */
+
+int
+xfs_check_nostate_extents(
+ xfs_ifork_t *ifp,
+ xfs_extnum_t idx,
+ xfs_extnum_t num)
+{
+ for (; num > 0; num--, idx++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+ if ((ep->l0 >>
+ (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
+ ASSERT(0);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *new;
+
+ new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+ /*
+ * Copy the firstblock, flist, and flags values,
+ * since init cursor doesn't get them.
+ */
+ new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+ new->bc_private.b.flist = cur->bc_private.b.flist;
+ new->bc_private.b.flags = cur->bc_private.b.flags;
+
+ return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+ struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst)
+{
+ ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+ (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+ ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+ dst->bc_private.b.allocated += src->bc_private.b.allocated;
+ dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+ src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int length,
+ int *stat)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = cur->bc_private.b.firstblock;
+ args.firstblock = args.fsbno;
+
+ if (args.fsbno == NULLFSBLOCK) {
+ args.fsbno = be64_to_cpu(start->l);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ /*
+ * Make sure there is sufficient room left in the AG to
+ * complete a full tree split for an extent insert. If
+ * we are converting the middle part of an extent then
+ * we may need space for two tree splits.
+ *
+ * We are relying on the caller to make the correct block
+ * reservation for this operation to succeed. If the
+ * reservation amount is insufficient then we may fail a
+ * block allocation here and corrupt the filesystem.
+ */
+ args.minleft = xfs_trans_get_block_res(args.tp);
+ } else if (cur->bc_private.b.flist->xbf_low) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+ if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+ error = XFS_ERROR(ENOSPC);
+ goto error0;
+ }
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+
+ if (args.fsbno == NULLFSBLOCK && args.minleft) {
+ /*
+ * Could not find an AG with enough free space to satisfy
+ * a full btree split. Try again without minleft and if
+ * successful activate the lowspace algorithm.
+ */
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.minleft = 0;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+ cur->bc_private.b.flist->xbf_low = 1;
+ }
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+ cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ cur->bc_private.b.ip->i_d.di_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+
+ new->l = cpu_to_be64(args.fsbno);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+ error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_trans *tp = cur->bc_tp;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+ xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+ ip->i_d.di_nblocks--;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, bp);
+ return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+ cur->bc_private.b.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+ cur->bc_private.b.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0);
+ }
+
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it. After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+ return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
+ level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff =
+ cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ ASSERT(key->bmbt.br_startoff != 0);
+
+ xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+ 0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+ cur->bc_rec.b.br_startoff;
+}
+
+#ifdef DEBUG
+STATIC int
+xfs_bmbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be64_to_cpu(k1->bmbt.br_startoff) <
+ be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+ xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif /* DEBUG */
+
+#ifdef XFS_BTREE_TRACE
+ktrace_t *xfs_bmbt_trace_buf;
+
+STATIC void
+xfs_bmbt_trace_enter(
+ struct xfs_btree_cur *cur,
+ const char *func,
+ char *s,
+ int type,
+ int line,
+ __psunsigned_t a0,
+ __psunsigned_t a1,
+ __psunsigned_t a2,
+ __psunsigned_t a3,
+ __psunsigned_t a4,
+ __psunsigned_t a5,
+ __psunsigned_t a6,
+ __psunsigned_t a7,
+ __psunsigned_t a8,
+ __psunsigned_t a9,
+ __psunsigned_t a10)
+{
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ int whichfork = cur->bc_private.b.whichfork;
+
+ ktrace_enter(xfs_bmbt_trace_buf,
+ (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+ (void *)func, (void *)s, (void *)ip, (void *)cur,
+ (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+ (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+ (void *)a8, (void *)a9, (void *)a10);
+}
+
+STATIC void
+xfs_bmbt_trace_cursor(
+ struct xfs_btree_cur *cur,
+ __uint32_t *s0,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ struct xfs_bmbt_rec_host r;
+
+ xfs_bmbt_set_all(&r, &cur->bc_rec.b);
+
+ *s0 = (cur->bc_nlevels << 24) |
+ (cur->bc_private.b.flags << 16) |
+ cur->bc_private.b.allocated;
+ *l0 = r.l0;
+ *l1 = r.l1;
+}
+
+STATIC void
+xfs_bmbt_trace_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ *l0 = be64_to_cpu(key->bmbt.br_startoff);
+ *l1 = 0;
+}
+
+/* Endian flipping versions of the bmbt extraction functions */
+STATIC void
+xfs_bmbt_disk_get_all(
+ xfs_bmbt_rec_t *r,
+ xfs_bmbt_irec_t *s)
+{
+ __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+ get_unaligned_be64(&r->l1), s);
+}
+
+STATIC void
+xfs_bmbt_trace_record(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ __uint64_t *l0,
+ __uint64_t *l1,
+ __uint64_t *l2)
+{
+ struct xfs_bmbt_irec irec;
+
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ *l0 = irec.br_startoff;
+ *l1 = irec.br_startblock;
+ *l2 = irec.br_blockcount;
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+ .rec_len = sizeof(xfs_bmbt_rec_t),
+ .key_len = sizeof(xfs_bmbt_key_t),
+
+ .dup_cursor = xfs_bmbt_dup_cursor,
+ .update_cursor = xfs_bmbt_update_cursor,
+ .alloc_block = xfs_bmbt_alloc_block,
+ .free_block = xfs_bmbt_free_block,
+ .get_maxrecs = xfs_bmbt_get_maxrecs,
+ .get_minrecs = xfs_bmbt_get_minrecs,
+ .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_bmbt_init_key_from_rec,
+ .init_rec_from_key = xfs_bmbt_init_rec_from_key,
+ .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
+ .key_diff = xfs_bmbt_key_diff,
+
+#ifdef DEBUG
+ .keys_inorder = xfs_bmbt_keys_inorder,
+ .recs_inorder = xfs_bmbt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+ .trace_enter = xfs_bmbt_trace_enter,
+ .trace_cursor = xfs_bmbt_trace_cursor,
+ .trace_key = xfs_bmbt_trace_key,
+ .trace_record = xfs_bmbt_trace_record,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur * /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* inode owning the btree */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_btnum = XFS_BTNUM_BMAP;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+ cur->bc_ops = &xfs_bmbt_ops;
+ cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+
+ cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+ cur->bc_private.b.ip = ip;
+ cur->bc_private.b.firstblock = NULLFSBLOCK;
+ cur->bc_private.b.flist = NULL;
+ cur->bc_private.b.allocated = 0;
+ cur->bc_private.b.flags = 0;
+ cur->bc_private.b.whichfork = whichfork;
+
+ return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= sizeof(xfs_bmdr_block_t);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmdr_rec_t);
+ return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
new file mode 100644
index 00000000..0e66c4ea
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+
+#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+
+struct xfs_btree_cur;
+struct xfs_btree_block;
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * l0:63 is an extent flag (value 1 indicates non-normal).
+ * l0:9-62 are startoff.
+ * l0:0-8 and l1:21-63 are startblock.
+ * l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN 1
+#define BMBT_STARTOFF_BITLEN 54
+#define BMBT_STARTBLOCK_BITLEN 52
+#define BMBT_BLOCKCOUNT_BITLEN 21
+
+typedef struct xfs_bmbt_rec {
+ __be64 l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+typedef struct xfs_bmbt_rec_host {
+ __uint64_t l0, l1;
+} xfs_bmbt_rec_host_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS 17
+#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
+#define DSTARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASK \
+ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK \
+ (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+ return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline int isnulldstartblock(xfs_dfsbno_t x)
+{
+ return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+ ASSERT(k < (1 << STARTBLOCKVALBITS));
+ return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+ return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+ XFS_EXTFMT_NOSTATE = 0,
+ XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+ XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+
+/*
+ * Extent state and extent format macros.
+ */
+#define XFS_EXTFMT_INODE(x) \
+ (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
+ XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
+#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+ __be64 br_startoff; /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+/*
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+ ((xfs_bmbt_rec_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+ ((xfs_bmbt_key_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_bmbt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+ ((xfs_bmdr_rec_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+ ((xfs_bmdr_key_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_bmdr_ptr_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
+
+#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
+ (int)(XFS_BTREE_LBLOCK_LEN + \
+ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+
+#define XFS_BMAP_BROOT_SPACE(bb) \
+ (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMDR_SPACE_CALC(nrecs) \
+ (int)(sizeof(xfs_bmdr_block_t) + \
+ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+ struct xfs_btree_block *, int);
+extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
+extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
+extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
+extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
+
+extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
+extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
+
+extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
+extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
+ xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
+extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
+extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
+extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
+
+extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
+ xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
+
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+ xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_inode *, int);
+
+
+#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
new file mode 100644
index 00000000..2f9e97c1
--- /dev/null
+++ b/fs/xfs/xfs_btree.c
@@ -0,0 +1,3679 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+/*
+ * Cursor allocation zone.
+ */
+kmem_zone_t *xfs_btree_cur_zone;
+
+/*
+ * Btree magic numbers.
+ */
+const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
+ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+};
+
+
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* btree long form block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp) /* buffer for block, if any */
+{
+ int lblock_ok; /* block passes checks */
+ struct xfs_mount *mp; /* file system mount point */
+
+ mp = cur->bc_mp;
+ lblock_ok =
+ be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+ be16_to_cpu(block->bb_level) == level &&
+ be16_to_cpu(block->bb_numrecs) <=
+ cur->bc_ops->get_maxrecs(cur, level) &&
+ block->bb_u.l.bb_leftsib &&
+ (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+ block->bb_u.l.bb_rightsib &&
+ (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_rightsib)));
+ if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+ XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+ XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
+ mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
+
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* btree short form block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp) /* buffer containing block */
+{
+ struct xfs_buf *agbp; /* buffer for ag. freespace struct */
+ struct xfs_agf *agf; /* ag. freespace structure */
+ xfs_agblock_t agflen; /* native ag. freespace length */
+ int sblock_ok; /* block passes checks */
+
+ agbp = cur->bc_private.a.agbp;
+ agf = XFS_BUF_TO_AGF(agbp);
+ agflen = be32_to_cpu(agf->agf_length);
+ sblock_ok =
+ be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+ be16_to_cpu(block->bb_level) == level &&
+ be16_to_cpu(block->bb_numrecs) <=
+ cur->bc_ops->get_maxrecs(cur, level) &&
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+ block->bb_u.s.bb_leftsib &&
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+ block->bb_u.s.bb_rightsib;
+ if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
+ XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+ XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
+ if (bp)
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
+ XFS_ERRLEVEL_LOW, cur->bc_mp, block);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
+
+/*
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp) /* buffer containing block, if any */
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_check_lblock(cur, block, level, bp);
+ else
+ return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
+ */
+int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_dfsbno_t bno, /* btree block disk address */
+ int level) /* btree block level */
+{
+ XFS_WANT_CORRUPTED_RETURN(
+ level > 0 &&
+ bno != NULLDFSBNO &&
+ XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* btree block disk address */
+ int level) /* btree block level */
+{
+ xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
+
+ XFS_WANT_CORRUPTED_RETURN(
+ level > 0 &&
+ bno != NULLAGBLOCK &&
+ bno != 0 &&
+ bno < agblocks);
+ return 0;
+}
+
+/*
+ * Check that block ptr is ok.
+ */
+STATIC int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ union xfs_btree_ptr *ptr, /* btree block disk address */
+ int index, /* offset from ptr to check */
+ int level) /* btree block level */
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ return xfs_btree_check_lptr(cur,
+ be64_to_cpu((&ptr->l)[index]), level);
+ } else {
+ return xfs_btree_check_sptr(cur,
+ be32_to_cpu((&ptr->s)[index]), level);
+ }
+}
+#endif
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int error) /* del because of error */
+{
+ int i; /* btree level */
+
+ /*
+ * Clear the buffer pointers, and release the buffers.
+ * If we're doing this in the face of an error, we
+ * need to make sure to inspect all of the entries
+ * in the bc_bufs array for buffers to be unlocked.
+ * This is because some of the btree code works from
+ * level n down to 0, and if we get an error along
+ * the way we won't have initialized all the entries
+ * down to 0.
+ */
+ for (i = 0; i < cur->bc_nlevels; i++) {
+ if (cur->bc_bufs[i])
+ xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
+ else if (!error)
+ break;
+ }
+ /*
+ * Can't free a bmap cursor without having dealt with the
+ * allocated indirect blocks' accounting.
+ */
+ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+ cur->bc_private.b.allocated == 0);
+ /*
+ * Free the cursor.
+ */
+ kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int /* error */
+xfs_btree_dup_cursor(
+ xfs_btree_cur_t *cur, /* input cursor */
+ xfs_btree_cur_t **ncur) /* output cursor */
+{
+ xfs_buf_t *bp; /* btree block's buffer pointer */
+ int error; /* error return value */
+ int i; /* level number of btree block */
+ xfs_mount_t *mp; /* mount structure for filesystem */
+ xfs_btree_cur_t *new; /* new cursor value */
+ xfs_trans_t *tp; /* transaction pointer, can be NULL */
+
+ tp = cur->bc_tp;
+ mp = cur->bc_mp;
+
+ /*
+ * Allocate a new cursor like the old one.
+ */
+ new = cur->bc_ops->dup_cursor(cur);
+
+ /*
+ * Copy the record currently in the cursor.
+ */
+ new->bc_rec = cur->bc_rec;
+
+ /*
+ * For each level current, re-get the buffer and copy the ptr value.
+ */
+ for (i = 0; i < new->bc_nlevels; i++) {
+ new->bc_ptrs[i] = cur->bc_ptrs[i];
+ new->bc_ra[i] = cur->bc_ra[i];
+ if ((bp = cur->bc_bufs[i])) {
+ if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+ xfs_btree_del_cursor(new, error);
+ *ncur = NULL;
+ return error;
+ }
+ new->bc_bufs[i] = bp;
+ ASSERT(bp);
+ ASSERT(!XFS_BUF_GETERROR(bp));
+ } else
+ new->bc_bufs[i] = NULL;
+ }
+ *ncur = new;
+ return 0;
+}
+
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * The leaf record start with a header then followed by records containing
+ * the values. A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers. The record and key structures are defined by the btree instances
+ * and opaque to the btree core. The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr). Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+ return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ XFS_BTREE_LBLOCK_LEN :
+ XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+ return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+ struct xfs_btree_cur *cur,
+ int n)
+{
+ return xfs_btree_block_len(cur) +
+ (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+ struct xfs_btree_cur *cur,
+ int n,
+ int level)
+{
+ return xfs_btree_block_len(cur) +
+ cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+ (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_rec *)
+ ((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ return (union xfs_btree_key *)
+ ((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+ struct xfs_btree_cur *cur,
+ int n,
+ struct xfs_btree_block *block)
+{
+ int level = xfs_btree_get_level(block);
+
+ ASSERT(block->bb_level != 0);
+
+ return (union xfs_btree_ptr *)
+ ((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Get a the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+ return (struct xfs_btree_block *)ifp->if_broot;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be an inode btree root or from a buffer.
+ */
+STATIC struct xfs_btree_block * /* generic btree block pointer */
+xfs_btree_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in btree */
+ struct xfs_buf **bpp) /* buffer containing the block */
+{
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *bpp = NULL;
+ return xfs_btree_get_iroot(cur);
+ }
+
+ *bpp = cur->bc_bufs[level];
+ return XFS_BUF_TO_BLOCK(*bpp);
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+xfs_buf_t * /* buffer for fsbno */
+xfs_btree_get_bufl(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ uint lock) /* lock flags for get_buf */
+{
+ xfs_buf_t *bp; /* buffer pointer (return value) */
+ xfs_daddr_t d; /* real disk block address */
+
+ ASSERT(fsbno != NULLFSBLOCK);
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+ ASSERT(bp);
+ ASSERT(!XFS_BUF_GETERROR(bp));
+ return bp;
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+xfs_buf_t * /* buffer for agno/agbno */
+xfs_btree_get_bufs(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ uint lock) /* lock flags for get_buf */
+{
+ xfs_buf_t *bp; /* buffer pointer (return value) */
+ xfs_daddr_t d; /* real disk block address */
+
+ ASSERT(agno != NULLAGNUMBER);
+ ASSERT(agbno != NULLAGBLOCK);
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+ ASSERT(bp);
+ ASSERT(!XFS_BUF_GETERROR(bp));
+ return bp;
+}
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int /* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int level) /* level to check */
+{
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ xfs_buf_t *bp; /* buffer containing block */
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ xfs_btree_check_block(cur, block, level, bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
+ else
+ return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
+}
+
+/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+STATIC int /* success=1, failure=0 */
+xfs_btree_firstrec(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int level) /* level to change */
+{
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ xfs_buf_t *bp; /* buffer containing block */
+
+ /*
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ xfs_btree_check_block(cur, block, level, bp);
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to 1, that's the first record/key.
+ */
+ cur->bc_ptrs[level] = 1;
+ return 1;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level. Other levels are unaffected.
+ */
+STATIC int /* success=1, failure=0 */
+xfs_btree_lastrec(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int level) /* level to change */
+{
+ struct xfs_btree_block *block; /* generic btree block pointer */
+ xfs_buf_t *bp; /* buffer containing block */
+
+ /*
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ xfs_btree_check_block(cur, block, level, bp);
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to numrecs, that's the last record/key.
+ */
+ cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
+ return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+ __int64_t fields, /* bitmask of fields */
+ const short *offsets, /* table of field offsets */
+ int nbits, /* number of bits to inspect */
+ int *first, /* output: first byte offset */
+ int *last) /* output: last byte offset */
+{
+ int i; /* current bit number */
+ __int64_t imask; /* mask for current bit number */
+
+ ASSERT(fields != 0);
+ /*
+ * Find the lowest bit, so the first byte offset.
+ */
+ for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+ if (imask & fields) {
+ *first = offsets[i];
+ break;
+ }
+ }
+ /*
+ * Find the highest bit, so the last byte offset.
+ */
+ for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+ if (imask & fields) {
+ *last = offsets[i + 1] - 1;
+ break;
+ }
+ }
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int /* error */
+xfs_btree_read_bufl(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ uint lock, /* lock flags for read_buf */
+ xfs_buf_t **bpp, /* buffer for fsbno */
+ int refval) /* ref count value for buffer */
+{
+ xfs_buf_t *bp; /* return value */
+ xfs_daddr_t d; /* real disk block address */
+ int error;
+
+ ASSERT(fsbno != NULLFSBLOCK);
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, lock, &bp))) {
+ return error;
+ }
+ ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+ if (bp)
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count) /* count of filesystem blocks */
+{
+ xfs_daddr_t d;
+
+ ASSERT(fsbno != NULLFSBLOCK);
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count) /* count of filesystem blocks */
+{
+ xfs_daddr_t d;
+
+ ASSERT(agno != NULLAGNUMBER);
+ ASSERT(agbno != NULLAGBLOCK);
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+}
+
+STATIC int
+xfs_btree_readahead_lblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+ xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+ xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+ xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+ rval++;
+ }
+
+ return rval;
+}
+
+STATIC int
+xfs_btree_readahead_sblock(
+ struct xfs_btree_cur *cur,
+ int lr,
+ struct xfs_btree_block *block)
+{
+ int rval = 0;
+ xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+ xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+
+ if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ left, 1);
+ rval++;
+ }
+
+ if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ right, 1);
+ rval++;
+ }
+
+ return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int lev, /* level in btree */
+ int lr) /* left/right bits */
+{
+ struct xfs_btree_block *block;
+
+ /*
+ * No readahead needed if we are at the root level and the
+ * btree root is stored in the inode.
+ */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (lev == cur->bc_nlevels - 1))
+ return 0;
+
+ if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+ return 0;
+
+ cur->bc_ra[lev] |= lr;
+ block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_readahead_lblock(cur, lr, block);
+ return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+STATIC void
+xfs_btree_setbuf(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int lev, /* level in btree */
+ xfs_buf_t *bp) /* new buffer to set */
+{
+ struct xfs_btree_block *b; /* btree block */
+
+ if (cur->bc_bufs[lev])
+ xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
+ cur->bc_bufs[lev] = bp;
+ cur->bc_ra[lev] = 0;
+
+ b = XFS_BUF_TO_BLOCK(bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
+ cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
+ cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ } else {
+ if (be32_to_cpu(b->bb_u.s.bb_leftsib) == NULLAGBLOCK)
+ cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+ if (be32_to_cpu(b->bb_u.s.bb_rightsib) == NULLAGBLOCK)
+ cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+ }
+}
+
+STATIC int
+xfs_btree_ptr_is_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return be64_to_cpu(ptr->l) == NULLDFSBNO;
+ else
+ return be32_to_cpu(ptr->s) == NULLAGBLOCK;
+}
+
+STATIC void
+xfs_btree_set_ptr_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(NULLDFSBNO);
+ else
+ ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->l = block->bb_u.l.bb_rightsib;
+ else
+ ptr->l = block->bb_u.l.bb_leftsib;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->s = block->bb_u.s.bb_rightsib;
+ else
+ ptr->s = block->bb_u.s.bb_leftsib;
+ }
+}
+
+STATIC void
+xfs_btree_set_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.l.bb_rightsib = ptr->l;
+ else
+ block->bb_u.l.bb_leftsib = ptr->l;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.s.bb_rightsib = ptr->s;
+ else
+ block->bb_u.s.bb_leftsib = ptr->s;
+ }
+}
+
+STATIC void
+xfs_btree_init_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ int numrecs,
+ struct xfs_btree_block *new) /* new block */
+{
+ new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+ new->bb_level = cpu_to_be16(level);
+ new->bb_numrecs = cpu_to_be16(numrecs);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ } else {
+ new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ }
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updateѕ to this record. The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level)
+{
+ union xfs_btree_ptr ptr;
+
+ if (level > 0)
+ return 0;
+ if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ return 0;
+
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &ptr))
+ return 0;
+ return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ XFS_BUF_ADDR(bp)));
+ else {
+ ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
+ XFS_BUF_ADDR(bp)));
+ }
+}
+
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO);
+
+ return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ } else {
+ ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+ ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
+
+ return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ be32_to_cpu(ptr->s));
+ }
+}
+
+STATIC void
+xfs_btree_set_refs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ switch (cur->bc_btnum) {
+ case XFS_BTNUM_BNO:
+ case XFS_BTNUM_CNT:
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+ break;
+ case XFS_BTNUM_INO:
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+ break;
+ case XFS_BTNUM_BMAP:
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+STATIC int
+xfs_btree_get_buf_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XBF_TRYLOCK));
+
+ d = xfs_btree_ptr_to_daddr(cur, ptr);
+ *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags);
+
+ ASSERT(*bpp);
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int level,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+ int error;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XBF_TRYLOCK));
+
+ d = xfs_btree_ptr_to_daddr(cur, ptr);
+ error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags, bpp);
+ if (error)
+ return error;
+
+ ASSERT(*bpp != NULL);
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+ xfs_btree_set_refs(cur, *bpp);
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+
+ error = xfs_btree_check_block(cur, *block, level, *bpp);
+ if (error)
+ xfs_trans_brelse(cur->bc_tp, *bpp);
+ return error;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ union xfs_btree_key *src_key,
+ int numkeys)
+{
+ ASSERT(numkeys >= 0);
+ memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *dst_rec,
+ union xfs_btree_rec *src_rec,
+ int numrecs)
+{
+ ASSERT(numrecs >= 0);
+ memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ union xfs_btree_ptr *src_ptr,
+ int numptrs)
+{
+ ASSERT(numptrs >= 0);
+ memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ int dir,
+ int numkeys)
+{
+ char *dst_key;
+
+ ASSERT(numkeys >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+ memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ int dir,
+ int numrecs)
+{
+ char *dst_rec;
+
+ ASSERT(numrecs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+ memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int dir,
+ int numptrs)
+{
+ char *dst_ptr;
+
+ ASSERT(numptrs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+ memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ if (bp) {
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_key_offset(cur, first),
+ xfs_btree_key_offset(cur, last + 1) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_rec_offset(cur, first),
+ xfs_btree_rec_offset(cur, last + 1) - 1);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int first, /* index of first pointer to log */
+ int last) /* index of last pointer to log */
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ if (bp) {
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ int level = xfs_btree_get_level(block);
+
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_ptr_offset(cur, first, level),
+ xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int fields) /* mask of fields: XFS_BB_... */
+{
+ int first; /* first byte offset logged */
+ int last; /* last byte offset logged */
+ static const short soffsets[] = { /* table of offsets (short) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+ XFS_BTREE_SBLOCK_LEN
+ };
+ static const short loffsets[] = { /* table of offsets (long) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+ XFS_BTREE_LBLOCK_LEN
+ };
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+ if (bp) {
+ xfs_btree_offsets(fields,
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ loffsets : soffsets,
+ XFS_BB_NUM_BITS, &first, &last);
+ xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_increment(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp;
+ int error; /* error return value */
+ int lev;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the right at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* We're done if we remain in the block after the increment. */
+ if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+ goto out1;
+
+ /* Fail if we just went off the right edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, increment);
+
+ /*
+ * March up the tree incrementing pointers.
+ * Stop when we don't go off the right edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, lev, bp);
+ if (error)
+ goto error0;
+#endif
+
+ if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+ break;
+
+ /* Read-ahead the right block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+ }
+
+ /*
+ * If we went off the root then we are either seriously
+ * confused or have the tree root in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+ 0, &block, &bp);
+ if (error)
+ goto error0;
+
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_ptrs[lev] = 1;
+ }
+out1:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_decrement(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ xfs_buf_t *bp;
+ int error; /* error return value */
+ int lev;
+ union xfs_btree_ptr ptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the left at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+ /* We're done if we remain in the block after the decrement. */
+ if (--cur->bc_ptrs[level] > 0)
+ goto out1;
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we just went off the left edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, decrement);
+
+ /*
+ * March up the tree decrementing pointers.
+ * Stop when we don't go off the left edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ if (--cur->bc_ptrs[lev] > 0)
+ break;
+ /* Read-ahead the left block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+ }
+
+ /*
+ * If we went off the root then we are seriously confused.
+ * or the root of the tree is in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+ 0, &block, &bp);
+ if (error)
+ goto error0;
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+ }
+out1:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_btree_lookup_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in the btree */
+ union xfs_btree_ptr *pp, /* ptr to btree block */
+ struct xfs_btree_block **blkp) /* return btree block */
+{
+ struct xfs_buf *bp; /* buffer pointer for btree block */
+ int error = 0;
+
+ /* special case the root block if in an inode */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *blkp = xfs_btree_get_iroot(cur);
+ return 0;
+ }
+
+ /*
+ * If the old buffer at this level for the disk address we are
+ * looking for re-use it.
+ *
+ * Otherwise throw it away and get a new one.
+ */
+ bp = cur->bc_bufs[level];
+ if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+ *blkp = XFS_BUF_TO_BLOCK(bp);
+ return 0;
+ }
+
+ error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+ if (error)
+ return error;
+
+ xfs_btree_setbuf(cur, level, bp);
+ return 0;
+}
+
+/*
+ * Get current search key. For level 0 we don't actually have a key
+ * structure so we make one up from the record. For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+ struct xfs_btree_cur *cur,
+ int level,
+ int keyno,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *kp)
+{
+ if (level == 0) {
+ cur->bc_ops->init_key_from_rec(kp,
+ xfs_btree_rec_addr(cur, keyno, block));
+ return kp;
+ }
+
+ return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record. The cursor is made to point to it, based on dir.
+ * Return 0 if can't find any such record, 1 for success.
+ */
+int /* error */
+xfs_btree_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_lookup_t dir, /* <=, ==, or >= */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ __int64_t diff; /* difference for the current key */
+ int error; /* error return value */
+ int keyno; /* current key number */
+ int level; /* level in the btree */
+ union xfs_btree_ptr *pp; /* ptr to btree block */
+ union xfs_btree_ptr ptr; /* ptr to btree block */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, dir);
+
+ XFS_BTREE_STATS_INC(cur, lookup);
+
+ block = NULL;
+ keyno = 0;
+
+ /* initialise start pointer from cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ pp = &ptr;
+
+ /*
+ * Iterate over each level in the btree, starting at the root.
+ * For each level above the leaves, find the key we need, based
+ * on the lookup record, then follow the corresponding block
+ * pointer down to the next level.
+ */
+ for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ /* Get the block we need to do the lookup on. */
+ error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+ if (error)
+ goto error0;
+
+ if (diff == 0) {
+ /*
+ * If we already had a key match at a higher level, we
+ * know we need to use the first entry in this block.
+ */
+ keyno = 1;
+ } else {
+ /* Otherwise search this block. Do a binary search. */
+
+ int high; /* high entry number */
+ int low; /* low entry number */
+
+ /* Set low and high entry numbers, 1-based. */
+ low = 1;
+ high = xfs_btree_get_numrecs(block);
+ if (!high) {
+ /* Block is empty, must be an empty leaf. */
+ ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+ cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Binary search the block. */
+ while (low <= high) {
+ union xfs_btree_key key;
+ union xfs_btree_key *kp;
+
+ XFS_BTREE_STATS_INC(cur, compare);
+
+ /* keyno is average of low and high. */
+ keyno = (low + high) >> 1;
+
+ /* Get current search key */
+ kp = xfs_lookup_get_search_key(cur, level,
+ keyno, block, &key);
+
+ /*
+ * Compute difference to get next direction:
+ * - less than, move right
+ * - greater than, move left
+ * - equal, we're done
+ */
+ diff = cur->bc_ops->key_diff(cur, kp);
+ if (diff < 0)
+ low = keyno + 1;
+ else if (diff > 0)
+ high = keyno - 1;
+ else
+ break;
+ }
+ }
+
+ /*
+ * If there are more levels, set up for the next level
+ * by getting the block number and filling in the cursor.
+ */
+ if (level > 0) {
+ /*
+ * If we moved left, need the previous key number,
+ * unless there isn't one.
+ */
+ if (diff > 0 && --keyno < 1)
+ keyno = 1;
+ pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, pp, 0, level);
+ if (error)
+ goto error0;
+#endif
+ cur->bc_ptrs[level] = keyno;
+ }
+ }
+
+ /* Done with the search. See if we need to adjust the results. */
+ if (dir != XFS_LOOKUP_LE && diff < 0) {
+ keyno++;
+ /*
+ * If ge search and we went off the end of the block, but it's
+ * not the last block, we're in the wrong block.
+ */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (dir == XFS_LOOKUP_GE &&
+ keyno > xfs_btree_get_numrecs(block) &&
+ !xfs_btree_ptr_is_null(cur, &ptr)) {
+ int i;
+
+ cur->bc_ptrs[0] = keyno;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+ }
+ } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ keyno--;
+ cur->bc_ptrs[0] = keyno;
+
+ /* Return if we succeeded or not. */
+ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+ *stat = 0;
+ else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ *stat = 1;
+ else
+ *stat = 0;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *keyp,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_key *kp;
+ int ptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+ /*
+ * Go up the tree from this level toward the root.
+ * At each level, update the key value to the value input.
+ * Stop when we reach a level where the cursor isn't pointing
+ * at the first entry in the block.
+ */
+ for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+#endif
+ ptr = cur->bc_ptrs[level];
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ xfs_btree_copy_keys(cur, kp, keyp, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ int error;
+ int ptr;
+ union xfs_btree_rec *rp;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGR(cur, rec);
+
+ /* Pick up the current block. */
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ goto error0;
+#endif
+ /* Get the address of the rec to be updated. */
+ ptr = cur->bc_ptrs[0];
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ /* Fill in the new contents and log them. */
+ xfs_btree_copy_recs(cur, rp, rec, 1);
+ xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, 0)) {
+ cur->bc_ops->update_lastrec(cur, block, rec,
+ ptr, LASTREC_UPDATE);
+ }
+
+ /* Updating first rec in leaf. Pass new key value up to our parent. */
+ if (ptr == 1) {
+ union xfs_btree_key key;
+
+ cur->bc_ops->init_key_from_rec(&key, rec);
+ error = xfs_btree_updkey(cur, &key, 1);
+ if (error)
+ goto error0;
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_lshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_key key; /* btree key */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs; /* left record count */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ int rrecs; /* right record count */
+ union xfs_btree_ptr lptr; /* left btree pointer */
+ union xfs_btree_key *rkp = NULL; /* right btree key */
+ union xfs_btree_ptr *rpp = NULL; /* right address pointer */
+ union xfs_btree_rec *rrp = NULL; /* right record pointer */
+ int error; /* error return value */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1)
+ goto out0;
+
+ /* Set up variables for this block as "right". */
+ right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no left sibling then we can't shift an entry left. */
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &lptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ if (cur->bc_ptrs[level] <= 1)
+ goto out0;
+
+ /* Set up the left neighbor as "left". */
+ error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ rrecs = xfs_btree_get_numrecs(right);
+
+ /*
+ * We add one entry to the left side and remove one for the right side.
+ * Account for it here, the changes will be updated on disk and logged
+ * later.
+ */
+ lrecs++;
+ rrecs--;
+
+ XFS_BTREE_STATS_INC(cur, lshift);
+ XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+ /*
+ * If non-leaf, copy a key and a ptr to the left block.
+ * Log the changes to the left block.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, rpp, 0, level);
+ if (error)
+ goto error0;
+#endif
+ xfs_btree_copy_keys(cur, lkp, rkp, 1);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+ xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur,
+ xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, 1);
+ xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->recs_inorder(cur,
+ xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+ }
+
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Slide the contents of right down one entry.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+ int i; /* loop index */
+
+ for (i = 0; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_shift_keys(cur,
+ xfs_btree_key_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_shift_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, right),
+ -1, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+ } else {
+ /* It's a leaf. operate on records */
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+ /*
+ * If it's the first record in the block, we'll need a key
+ * structure to pass up to the next level (updkey).
+ */
+ cur->bc_ops->init_key_from_rec(&key,
+ xfs_btree_rec_addr(cur, 1, right));
+ rkp = &key;
+ }
+
+ /* Update the parent key values of right. */
+ error = xfs_btree_updkey(cur, rkp, level + 1);
+ if (error)
+ goto error0;
+
+ /* Slide the cursor value left one. */
+ cur->bc_ptrs[level]--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_rshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_key key; /* btree key */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ union xfs_btree_ptr rptr; /* right block pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ int rrecs; /* right record count */
+ int lrecs; /* left record count */
+ int error; /* error return value */
+ int i; /* loop counter */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1))
+ goto out0;
+
+ /* Set up variables for this block as "left". */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no right sibling then we can't shift an entry right. */
+ xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (cur->bc_ptrs[level] >= lrecs)
+ goto out0;
+
+ /* Set up the right neighbor as "right". */
+ error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, rshift);
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /*
+ * Make a hole at the start of the right neighbor block, then
+ * copy the last left block entry to the hole.
+ */
+ if (level > 0) {
+ /* It's a nonleaf. make a hole in the keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+ union xfs_btree_ptr *rpp;
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+ for (i = rrecs - 1; i >= 0; i--) {
+ error = xfs_btree_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+ xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, lpp, 0, level);
+ if (error)
+ goto error0;
+#endif
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_keys(cur, rkp, lkp, 1);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+ xfs_btree_key_addr(cur, 2, right)));
+ } else {
+ /* It's a leaf. make a hole in the records */
+ union xfs_btree_rec *lrp;
+ union xfs_btree_rec *rrp;
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_recs(cur, rrp, lrp, 1);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+ cur->bc_ops->init_key_from_rec(&key, rrp);
+ rkp = &key;
+
+ ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+ xfs_btree_rec_addr(cur, 2, right)));
+ }
+
+ /*
+ * Decrement and log left's numrecs, bump and log right's numrecs.
+ */
+ xfs_btree_set_numrecs(left, --lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, ++rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the right.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ error = xfs_btree_updkey(tcur, rkp, level + 1);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+
+error1:
+ XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int /* error */
+xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rrptr; /* right-right sibling ptr */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ int lrecs;
+ int rrecs;
+ int src_index;
+ int error; /* error return value */
+#ifdef DEBUG
+ int i;
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+ XFS_BTREE_STATS_INC(cur, split);
+
+ /* Set up left block (current one). */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block as "right". */
+ error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* Fill in the btree header for the new right block. */
+ xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+
+ /*
+ * Split the entries between the old and the new block evenly.
+ * Make sure that if there's an odd number of entries now, that
+ * each new block will have the same number of entries.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ rrecs = lrecs / 2;
+ if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+ rrecs++;
+ src_index = (lrecs - rrecs + 1);
+
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /*
+ * Copy btree block entries from the left block over to the
+ * new block, the right. Update the right block and log the
+ * changes.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, src_index, left);
+ lpp = xfs_btree_ptr_addr(cur, src_index, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+ for (i = src_index; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+ /* Grab the keys to the entries moved to the right block */
+ xfs_btree_copy_keys(cur, key, rkp, 1);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, src_index, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+ cur->bc_ops->init_key_from_rec(key,
+ xfs_btree_rec_addr(cur, 1, right));
+ }
+
+
+ /*
+ * Find the left block number by looking in the buffer.
+ * Adjust numrecs, sibling pointers.
+ */
+ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+ xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+ lrecs -= rrecs;
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+ xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /*
+ * If there's a block to the new block's right, make that block
+ * point back to right instead of to left.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+ error = xfs_btree_read_buf_block(cur, &rrptr, level,
+ 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+ /*
+ * If the cursor is really in the right block, move it there.
+ * If it's just pointing past the last entry in left, then we'll
+ * insert there, so don't change anything in that case.
+ */
+ if (cur->bc_ptrs[level] > lrecs + 1) {
+ xfs_btree_setbuf(cur, level, rbp);
+ cur->bc_ptrs[level] -= lrecs;
+ }
+ /*
+ * If there are more levels, we'll need another cursor which refers
+ * the right block, no matter where this cursor was.
+ */
+ if (level + 1 < cur->bc_nlevels) {
+ error = xfs_btree_dup_cursor(cur, curp);
+ if (error)
+ goto error0;
+ (*curp)->bc_ptrs[level + 1]++;
+ }
+ *ptrp = rptr;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int /* error */
+xfs_btree_new_iroot(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *logflags, /* logging flags for inode */
+ int *stat) /* return status - 0 fail */
+{
+ struct xfs_buf *cbp; /* buffer for cblock */
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_btree_block *cblock; /* child btree block */
+ union xfs_btree_key *ckp; /* child key pointer */
+ union xfs_btree_ptr *cpp; /* child ptr pointer */
+ union xfs_btree_key *kp; /* pointer to btree key */
+ union xfs_btree_ptr *pp; /* pointer to block addr */
+ union xfs_btree_ptr nptr; /* new block addr */
+ int level; /* btree level */
+ int error; /* error return code */
+#ifdef DEBUG
+ int i; /* loop counter */
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ level = cur->bc_nlevels - 1;
+
+ block = xfs_btree_get_iroot(cur);
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+ }
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Copy the root into a real block. */
+ error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+ if (error)
+ goto error0;
+
+ memcpy(cblock, block, xfs_btree_block_len(cur));
+
+ be16_add_cpu(&block->bb_level, 1);
+ xfs_btree_set_numrecs(block, 1);
+ cur->bc_nlevels++;
+ cur->bc_ptrs[level + 1] = 1;
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+ for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+ error = xfs_btree_check_ptr(cur, pp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+ if (error)
+ goto error0;
+#endif
+ xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+ xfs_iroot_realloc(cur->bc_private.b.ip,
+ 1 - xfs_btree_get_numrecs(cblock),
+ cur->bc_private.b.whichfork);
+
+ xfs_btree_setbuf(cur, level, cbp);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+ xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+ *logflags |=
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+ *stat = 1;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int /* error */
+xfs_btree_new_root(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* one half of the old root block */
+ struct xfs_buf *bp; /* buffer containing block */
+ int error; /* error return value */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *nbp; /* new (root) buffer */
+ struct xfs_btree_block *new; /* new (root) btree block */
+ int nptr; /* new value for key index, 1 or 2 */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rptr;
+ union xfs_btree_ptr lptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ /* initialise our start point from the cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+ /* Allocate the new block. If we can't do it, we're toast. Give up. */
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block. */
+ error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+ if (error)
+ goto error0;
+
+ /* Set the root in the holding structure increasing the level by 1. */
+ cur->bc_ops->set_root(cur, &lptr, 1);
+
+ /*
+ * At the previous root level there are now two blocks: the old root,
+ * and the new block generated when it was split. We don't know which
+ * one the cursor is pointing at, so we set up variables "left" and
+ * "right" for each case.
+ */
+ block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /* Our block is left, pick up the right block. */
+ lbp = bp;
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+ left = block;
+ error = xfs_btree_read_buf_block(cur, &rptr,
+ cur->bc_nlevels - 1, 0, &right, &rbp);
+ if (error)
+ goto error0;
+ bp = rbp;
+ nptr = 1;
+ } else {
+ /* Our block is right, pick up the left block. */
+ rbp = bp;
+ xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+ right = block;
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ error = xfs_btree_read_buf_block(cur, &lptr,
+ cur->bc_nlevels - 1, 0, &left, &lbp);
+ if (error)
+ goto error0;
+ bp = lbp;
+ nptr = 2;
+ }
+ /* Fill in the new block's btree header and log it. */
+ xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+ xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+ ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+ !xfs_btree_ptr_is_null(cur, &rptr));
+
+ /* Fill in the key data in the new root. */
+ if (xfs_btree_get_level(left) > 0) {
+ xfs_btree_copy_keys(cur,
+ xfs_btree_key_addr(cur, 1, new),
+ xfs_btree_key_addr(cur, 1, left), 1);
+ xfs_btree_copy_keys(cur,
+ xfs_btree_key_addr(cur, 2, new),
+ xfs_btree_key_addr(cur, 1, right), 1);
+ } else {
+ cur->bc_ops->init_key_from_rec(
+ xfs_btree_key_addr(cur, 1, new),
+ xfs_btree_rec_addr(cur, 1, left));
+ cur->bc_ops->init_key_from_rec(
+ xfs_btree_key_addr(cur, 2, new),
+ xfs_btree_rec_addr(cur, 1, right));
+ }
+ xfs_btree_log_keys(cur, nbp, 1, 2);
+
+ /* Fill in the pointer data in the new root. */
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+ xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+ /* Fix up the cursor. */
+ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+ cur->bc_ptrs[cur->bc_nlevels] = nptr;
+ cur->bc_nlevels++;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* btree level */
+ int numrecs,/* # of recs in block */
+ int *oindex,/* old tree index */
+ int *index, /* new tree index */
+ union xfs_btree_ptr *nptr, /* new btree ptr */
+ struct xfs_btree_cur **ncur, /* new btree cursor */
+ union xfs_btree_rec *nrec, /* new record */
+ int *stat)
+{
+ union xfs_btree_key key; /* new btree key value */
+ int error = 0;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+
+ if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+ /* A root block that can be made bigger. */
+
+ xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+ } else {
+ /* A root block that needs replacing */
+ int logflags = 0;
+
+ error = xfs_btree_new_iroot(cur, &logflags, stat);
+ if (error || *stat == 0)
+ return error;
+
+ xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+ }
+
+ return 0;
+ }
+
+ /* First, try shifting an entry to the right neighbor. */
+ error = xfs_btree_rshift(cur, level, stat);
+ if (error || *stat)
+ return error;
+
+ /* Next, try shifting an entry to the left neighbor. */
+ error = xfs_btree_lshift(cur, level, stat);
+ if (error)
+ return error;
+
+ if (*stat) {
+ *oindex = *index = cur->bc_ptrs[level];
+ return 0;
+ }
+
+ /*
+ * Next, try splitting the current block in half.
+ *
+ * If this works we have to re-set our variables because we
+ * could be in a different block now.
+ */
+ error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+ if (error || *stat == 0)
+ return error;
+
+
+ *index = cur->bc_ptrs[level];
+ cur->bc_ops->init_rec_from_key(&key, nrec);
+ return 0;
+}
+
+/*
+ * Insert one record/level. Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level to insert record at */
+ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
+ union xfs_btree_rec *recp, /* i/o: record data inserted */
+ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer for block */
+ union xfs_btree_key key; /* btree key */
+ union xfs_btree_ptr nptr; /* new block ptr */
+ struct xfs_btree_cur *ncur; /* new btree cursor */
+ union xfs_btree_rec nrec; /* new record count */
+ int optr; /* old key/record index */
+ int ptr; /* key/record index */
+ int numrecs;/* number of records */
+ int error; /* error return value */
+#ifdef DEBUG
+ int i;
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+ ncur = NULL;
+
+ /*
+ * If we have an external root pointer, and we've made it to the
+ * root level, allocate a new root block and we're done.
+ */
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level >= cur->bc_nlevels)) {
+ error = xfs_btree_new_root(cur, stat);
+ xfs_btree_set_ptr_null(cur, ptrp);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return error;
+ }
+
+ /* If we're off the left edge, return failure. */
+ ptr = cur->bc_ptrs[level];
+ if (ptr == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Make a key out of the record data to be inserted, and save it. */
+ cur->bc_ops->init_key_from_rec(&key, recp);
+
+ optr = ptr;
+
+ XFS_BTREE_STATS_INC(cur, insrec);
+
+ /* Get pointers to the btree buffer and block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+
+ /* Check that the new entry is being inserted in the right place. */
+ if (ptr <= numrecs) {
+ if (level == 0) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+ xfs_btree_rec_addr(cur, ptr, block)));
+ } else {
+ ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+ xfs_btree_key_addr(cur, ptr, block)));
+ }
+ }
+#endif
+
+ /*
+ * If the block is full, we can't insert the new entry until we
+ * make the block un-full.
+ */
+ xfs_btree_set_ptr_null(cur, &nptr);
+ if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+ error = xfs_btree_make_block_unfull(cur, level, numrecs,
+ &optr, &ptr, &nptr, &ncur, &nrec, stat);
+ if (error || *stat == 0)
+ goto error0;
+ }
+
+ /*
+ * The current block may have changed if the block was
+ * previously full and we have just made space in it.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ return error;
+#endif
+
+ /*
+ * At this point we know there's room for our new entry in the block
+ * we're pointing at.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+ if (level > 0) {
+ /* It's a nonleaf. make a hole in the keys and ptrs */
+ union xfs_btree_key *kp;
+ union xfs_btree_ptr *pp;
+
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+ for (i = numrecs - ptr; i >= 0; i--) {
+ error = xfs_btree_check_ptr(cur, pp, i, level);
+ if (error)
+ return error;
+ }
+#endif
+
+ xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+ xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+ if (error)
+ goto error0;
+#endif
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_keys(cur, kp, &key, 1);
+ xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+ numrecs++;
+ xfs_btree_set_numrecs(block, numrecs);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+ xfs_btree_key_addr(cur, ptr + 1, block)));
+ }
+#endif
+ } else {
+ /* It's a leaf. make a hole in the records */
+ union xfs_btree_rec *rp;
+
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+ /* Now put the new data in, bump numrecs and log it. */
+ xfs_btree_copy_recs(cur, rp, recp, 1);
+ xfs_btree_set_numrecs(block, ++numrecs);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+ if (ptr < numrecs) {
+ ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+ xfs_btree_rec_addr(cur, ptr + 1, block)));
+ }
+#endif
+ }
+
+ /* Log the new number of records in the btree header. */
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /* If we inserted at the start of a block, update the parents' keys. */
+ if (optr == 1) {
+ error = xfs_btree_updkey(cur, &key, level + 1);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, recp,
+ ptr, LASTREC_INSREC);
+ }
+
+ /*
+ * Return the new block number, if any.
+ * If there is one, give back a record value and a cursor too.
+ */
+ *ptrp = nptr;
+ if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+ *recp = nrec;
+ *curp = ncur;
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor. All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+ struct xfs_btree_cur *cur,
+ int *stat)
+{
+ int error; /* error return value */
+ int i; /* result value, 0 for failure */
+ int level; /* current level number in btree */
+ union xfs_btree_ptr nptr; /* new block number (split result) */
+ struct xfs_btree_cur *ncur; /* new cursor (split result) */
+ struct xfs_btree_cur *pcur; /* previous level's cursor */
+ union xfs_btree_rec rec; /* record to insert */
+
+ level = 0;
+ ncur = NULL;
+ pcur = cur;
+
+ xfs_btree_set_ptr_null(cur, &nptr);
+ cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+ /*
+ * Loop going up the tree, starting at the leaf level.
+ * Stop when we don't get a split block, that must mean that
+ * the insert is finished with this level.
+ */
+ do {
+ /*
+ * Insert nrec/nptr into this level of the tree.
+ * Note if we fail, nptr will be null.
+ */
+ error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+ if (error) {
+ if (pcur != cur)
+ xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+ goto error0;
+ }
+
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ level++;
+
+ /*
+ * See if the cursor we just used is trash.
+ * Can't trash the caller's cursor, but otherwise we should
+ * if ncur is a new cursor or we're about to be done.
+ */
+ if (pcur != cur &&
+ (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+ /* Save the state from the cursor before we trash it */
+ if (cur->bc_ops->update_cursor)
+ cur->bc_ops->update_cursor(pcur, cur);
+ cur->bc_nlevels = pcur->bc_nlevels;
+ xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+ }
+ /* If we got a new cursor, switch to it. */
+ if (ncur) {
+ pcur = ncur;
+ ncur = NULL;
+ }
+ } while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = i;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block. But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+STATIC int
+xfs_btree_kill_iroot(
+ struct xfs_btree_cur *cur)
+{
+ int whichfork = cur->bc_private.b.whichfork;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_block *block;
+ struct xfs_btree_block *cblock;
+ union xfs_btree_key *kp;
+ union xfs_btree_key *ckp;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_ptr *cpp;
+ struct xfs_buf *cbp;
+ int level;
+ int index;
+ int numrecs;
+#ifdef DEBUG
+ union xfs_btree_ptr ptr;
+ int i;
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_nlevels > 1);
+
+ /*
+ * Don't deal with the root block needs to be a leaf case.
+ * We're just going to turn the thing back into extents anyway.
+ */
+ level = cur->bc_nlevels - 1;
+ if (level == 1)
+ goto out0;
+
+ /*
+ * Give up if the root has multiple children.
+ */
+ block = xfs_btree_get_iroot(cur);
+ if (xfs_btree_get_numrecs(block) != 1)
+ goto out0;
+
+ cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+ numrecs = xfs_btree_get_numrecs(cblock);
+
+ /*
+ * Only do this if the next level will fit.
+ * Then the data must be copied up to the inode,
+ * instead of freeing the root you free the next level.
+ */
+ if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+ index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+ if (index) {
+ xfs_iroot_realloc(cur->bc_private.b.ip, index,
+ cur->bc_private.b.whichfork);
+ block = ifp->if_broot;
+ }
+
+ be16_add_cpu(&block->bb_numrecs, index);
+ ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+ for (i = 0; i < numrecs; i++) {
+ int error;
+
+ error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+ }
+#endif
+ xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+ cur->bc_ops->free_block(cur, cbp);
+ XFS_BTREE_STATS_INC(cur, free);
+
+ cur->bc_bufs[level - 1] = NULL;
+ be16_add_cpu(&block->bb_level, -1);
+ xfs_trans_log_inode(cur->bc_tp, ip,
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ cur->bc_nlevels--;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+/*
+ * Kill the current root node, and replace it with it's only child node.
+ */
+STATIC int
+xfs_btree_kill_root(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ union xfs_btree_ptr *newroot)
+{
+ int error;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+ /*
+ * Update the root pointer, decreasing the level by 1 and then
+ * free the old root.
+ */
+ cur->bc_ops->set_root(cur, newroot, -1);
+
+ error = cur->bc_ops->free_block(cur, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+
+ XFS_BTREE_STATS_INC(cur, free);
+
+ cur->bc_bufs[level] = NULL;
+ cur->bc_ra[level] = 0;
+ cur->bc_nlevels--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat)
+{
+ int error;
+ int i;
+
+ if (level > 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ return error;
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int /* error */
+xfs_btree_delrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level removing record from */
+ int *stat) /* fail/done/go-on */
+{
+ struct xfs_btree_block *block; /* btree block */
+ union xfs_btree_ptr cptr; /* current block ptr */
+ struct xfs_buf *bp; /* buffer for block */
+ int error; /* error return value */
+ int i; /* loop counter */
+ union xfs_btree_key key; /* storage for keyp */
+ union xfs_btree_key *keyp = &key; /* passed to the next level */
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs = 0; /* left record count */
+ int ptr; /* key/record index */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ int rrecs = 0; /* right record count */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ int numrecs; /* temporary numrec count */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ tcur = NULL;
+
+ /* Get the index of the entry being deleted, check for nothing there. */
+ ptr = cur->bc_ptrs[level];
+ if (ptr == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Get the buffer & block containing the record or key/ptr. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we're off the end of the block. */
+ if (ptr > numrecs) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ XFS_BTREE_STATS_INC(cur, delrec);
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+ /* Excise the entries being deleted. */
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+
+ lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+ lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+ for (i = 0; i < numrecs - ptr; i++) {
+ error = xfs_btree_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ if (ptr < numrecs) {
+ xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+ xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+ }
+
+ /*
+ * If it's the first record in the block, we'll need to pass a
+ * key up to the next level (updkey).
+ */
+ if (ptr == 1)
+ keyp = xfs_btree_key_addr(cur, 1, block);
+ } else {
+ /* It's a leaf. operate on records */
+ if (ptr < numrecs) {
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, ptr + 1, block),
+ -1, numrecs - ptr);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+ }
+
+ /*
+ * If it's the first record in the block, we'll need a key
+ * structure to pass up to the next level (updkey).
+ */
+ if (ptr == 1) {
+ cur->bc_ops->init_key_from_rec(&key,
+ xfs_btree_rec_addr(cur, 1, block));
+ keyp = &key;
+ }
+ }
+
+ /*
+ * Decrement and log the number of entries in the block.
+ */
+ xfs_btree_set_numrecs(block, --numrecs);
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, NULL,
+ ptr, LASTREC_DELREC);
+ }
+
+ /*
+ * We're at the root level. First, shrink the root block in-memory.
+ * Try to get rid of the next level down. If we can't then there's
+ * nothing left to do.
+ */
+ if (level == cur->bc_nlevels - 1) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+ cur->bc_private.b.whichfork);
+
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If this is the root level, and there's only one entry left,
+ * and it's NOT the leaf level, then we can get rid of this
+ * level.
+ */
+ if (numrecs == 1 && level > 0) {
+ union xfs_btree_ptr *pp;
+ /*
+ * pp is still set to the first pointer in the block.
+ * Make it the new root of the btree.
+ */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ error = xfs_btree_kill_root(cur, bp, level, pp);
+ if (error)
+ goto error0;
+ } else if (level > 0) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ }
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If we deleted the leftmost entry in the block, update the
+ * key values above us in the tree.
+ */
+ if (ptr == 1) {
+ error = xfs_btree_updkey(cur, keyp, level + 1);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If the number of records remaining in the block is at least
+ * the minimum, we're done.
+ */
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ /*
+ * Otherwise, we have to move some records around to keep the
+ * tree balanced. Look at the left and right sibling blocks to
+ * see if we can re-balance by moving only one record.
+ */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * One child of root, need to get a chance to copy its contents
+ * into the root and delete it. Can't go up to next level,
+ * there's nothing to delete there.
+ */
+ if (xfs_btree_ptr_is_null(cur, &rptr) &&
+ xfs_btree_ptr_is_null(cur, &lptr) &&
+ level == cur->bc_nlevels - 2) {
+ error = xfs_btree_kill_iroot(cur);
+ if (!error)
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+ !xfs_btree_ptr_is_null(cur, &lptr));
+
+ /*
+ * Duplicate the cursor so our btree manipulations here won't
+ * disrupt the next level up.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * If there's a right sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /*
+ * Move the temp cursor to the last entry in the next block.
+ * Actually any entry but the first would suffice.
+ */
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ /* Grab a pointer to the block. */
+ right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(tcur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+ /*
+ * If right block is full enough so that removing one entry
+ * won't make it too empty, and left-shifting an entry out
+ * of right to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(right) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_lshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference, and fix up the temp cursor to point
+ * to our block again (last record).
+ */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+ }
+
+ /*
+ * If there's a left sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ /*
+ * Move the temp cursor to the first entry in the
+ * previous block.
+ */
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ /* Grab a pointer to the block. */
+ left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+ /*
+ * If left block is full enough so that removing one entry
+ * won't make it too empty, and right-shifting an entry out
+ * of left to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(left) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_rshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+ if (level == 0)
+ cur->bc_ptrs[0]++;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ }
+
+ /* Delete the temp cursor, we're done with it. */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ /* If here, we need to do a join to keep the tree balanced. */
+ ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+ if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+ lrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "right" to be the starting block,
+ * "left" to be the left neighbor.
+ */
+ rptr = cptr;
+ right = block;
+ rbp = bp;
+ error = xfs_btree_read_buf_block(cur, &lptr, level,
+ 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /*
+ * If that won't work, see if we can join with the right neighbor block.
+ */
+ } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+ rrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "left" to be the starting block,
+ * "right" to be the right neighbor.
+ */
+ lptr = cptr;
+ left = block;
+ lbp = bp;
+ error = xfs_btree_read_buf_block(cur, &rptr, level,
+ 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /*
+ * Otherwise, we can't fix the imbalance.
+ * Just return. This is probably a logic error, but it's not fatal.
+ */
+ } else {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ rrecs = xfs_btree_get_numrecs(right);
+ lrecs = xfs_btree_get_numrecs(left);
+
+ /*
+ * We're now going to join "left" and "right" by moving all the stuff
+ * in "right" to "left" and deleting "right".
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+ for (i = 1; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+ xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+ xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ }
+
+ XFS_BTREE_STATS_INC(cur, join);
+
+ /*
+ * Fix up the number of records and right block pointer in the
+ * surviving block, and log it.
+ */
+ xfs_btree_set_numrecs(left, lrecs + rrecs);
+ xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+ xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /* If there is a right sibling, point it to the remaining block. */
+ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+ error = xfs_btree_read_buf_block(cur, &cptr, level,
+ 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+
+ /* Free the deleted block. */
+ error = cur->bc_ops->free_block(cur, rbp);
+ if (error)
+ goto error0;
+ XFS_BTREE_STATS_INC(cur, free);
+
+ /*
+ * If we joined with the left neighbor, set the buffer in the
+ * cursor to the left block, and fix up the index.
+ */
+ if (bp != lbp) {
+ cur->bc_bufs[level] = lbp;
+ cur->bc_ptrs[level] += lrecs;
+ cur->bc_ra[level] = 0;
+ }
+ /*
+ * If we joined with the right neighbor and there's a level above
+ * us, increment the cursor at that level.
+ */
+ else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+ (level + 1 < cur->bc_nlevels)) {
+ error = xfs_btree_increment(cur, level + 1, &i);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * Readjust the ptr at this level if it's not a leaf, since it's
+ * still pointing at the deletion point, which makes the cursor
+ * inconsistent. If this makes the ptr 0, the caller fixes it up.
+ * We can't use decrement because it would change the next level up.
+ */
+ if (level > 0)
+ cur->bc_ptrs[level]--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ /* Return value means the next level up has something to do. */
+ *stat = 2;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (tcur)
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int /* error */
+xfs_btree_delete(
+ struct xfs_btree_cur *cur,
+ int *stat) /* success/failure */
+{
+ int error; /* error return value */
+ int level;
+ int i;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ /*
+ * Go up the tree, starting at leaf level.
+ *
+ * If 2 is returned then a join was done; go to the next level.
+ * Otherwise we are done.
+ */
+ for (level = 0, i = 2; i == 2; level++) {
+ error = xfs_btree_delrec(cur, level, &i);
+ if (error)
+ goto error0;
+ }
+
+ if (i == 0) {
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ if (cur->bc_ptrs[level] == 0) {
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ goto error0;
+ break;
+ }
+ }
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = i;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_btree_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ union xfs_btree_rec **recp, /* output: btree record */
+ int *stat) /* output: success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer pointer */
+ int ptr; /* record number */
+#ifdef DEBUG
+ int error; /* error return value */
+#endif
+
+ ptr = cur->bc_ptrs[0];
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ return error;
+#endif
+
+ /*
+ * Off the right end or left end, return failure.
+ */
+ if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ /*
+ * Point to the record and extract its data.
+ */
+ *recp = xfs_btree_rec_addr(cur, ptr, block);
+ *stat = 1;
+ return 0;
+}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
new file mode 100644
index 00000000..82fafc66
--- /dev/null
+++ b/fs/xfs/xfs_btree.h
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BTREE_H__
+#define __XFS_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern kmem_zone_t *xfs_btree_cur_zone;
+
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
+
+/*
+ * Generic btree header.
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees. The first three fields are shared by both format, but
+ * the pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use
+ * the size macros below. Never use sizeof(xfs_btree_block).
+ */
+struct xfs_btree_block {
+ __be32 bb_magic; /* magic number for block type */
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+ union {
+ struct {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+ } s; /* short form pointers */
+ struct {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+ } l; /* long form pointers */
+ } bb_u; /* rest */
+};
+
+#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+ __be32 s; /* short form ptr */
+ __be64 l; /* long form ptr */
+};
+
+union xfs_btree_key {
+ xfs_bmbt_key_t bmbt;
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc;
+ xfs_inobt_key_t inobt;
+};
+
+union xfs_btree_rec {
+ xfs_bmbt_rec_t bmbt;
+ xfs_bmdr_rec_t bmbr; /* bmbt root block */
+ xfs_alloc_rec_t alloc;
+ xfs_inobt_rec_t inobt;
+};
+
+/*
+ * For logging record fields.
+ */
+#define XFS_BB_MAGIC 0x01
+#define XFS_BB_LEVEL 0x02
+#define XFS_BB_NUMRECS 0x04
+#define XFS_BB_LEFTSIB 0x08
+#define XFS_BB_RIGHTSIB 0x10
+#define XFS_BB_NUM_BITS 5
+#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+
+/*
+ * Magic numbers for btree blocks.
+ */
+extern const __uint32_t xfs_magics[];
+
+/*
+ * Generic stats interface
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+ XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat) \
+do { \
+ switch (cur->bc_btnum) { \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ } \
+} while (0)
+
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+ XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define XFS_BTREE_STATS_ADD(cur, stat, val) \
+do { \
+ switch (cur->bc_btnum) { \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ } \
+} while (0)
+
+#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
+
+struct xfs_btree_ops {
+ /* size of the key and record structures */
+ size_t key_len;
+ size_t rec_len;
+
+ /* cursor operations */
+ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+ void (*update_cursor)(struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst);
+
+ /* update btree root pointer */
+ void (*set_root)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr, int level_change);
+
+ /* block allocation / freeing */
+ int (*alloc_block)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int length, int *stat);
+ int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+ /* update last record information */
+ void (*update_lastrec)(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_rec *rec,
+ int ptr, int reason);
+
+ /* records in block/level */
+ int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+ int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* records on disk. Matter for the root in inode case. */
+ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* init values of btree structures */
+ void (*init_key_from_rec)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
+ void (*init_rec_from_key)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
+ void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec);
+ void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+
+ /* difference between key value and cursor value */
+ __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key);
+
+#ifdef DEBUG
+ /* check that k1 is lower than k2 */
+ int (*keys_inorder)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2);
+
+ /* check that r1 is lower than r2 */
+ int (*recs_inorder)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2);
+#endif
+
+ /* btree tracing */
+#ifdef XFS_BTREE_TRACE
+ void (*trace_enter)(struct xfs_btree_cur *, const char *,
+ char *, int, int, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t);
+ void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
+ __uint64_t *, __uint64_t *);
+ void (*trace_key)(struct xfs_btree_cur *,
+ union xfs_btree_key *, __uint64_t *,
+ __uint64_t *);
+ void (*trace_record)(struct xfs_btree_cur *,
+ union xfs_btree_rec *, __uint64_t *,
+ __uint64_t *, __uint64_t *);
+#endif
+};
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
+
+
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ */
+typedef struct xfs_btree_cur
+{
+ struct xfs_trans *bc_tp; /* transaction we're in, if any */
+ struct xfs_mount *bc_mp; /* file system mount struct */
+ const struct xfs_btree_ops *bc_ops;
+ uint bc_flags; /* btree features - below */
+ union {
+ xfs_alloc_rec_incore_t a;
+ xfs_bmbt_irec_t b;
+ xfs_inobt_rec_incore_t i;
+ } bc_rec; /* current insert/search record value */
+ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
+ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
+ __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
+#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
+ __uint8_t bc_nlevels; /* number of levels in the tree */
+ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
+ xfs_btnum_t bc_btnum; /* identifies which btree type */
+ union {
+ struct { /* needed for BNO, CNT, INO */
+ struct xfs_buf *agbp; /* agf/agi buffer pointer */
+ xfs_agnumber_t agno; /* ag number */
+ } a;
+ struct { /* needed for BMAP */
+ struct xfs_inode *ip; /* pointer to our inode */
+ struct xfs_bmap_free *flist; /* list to free after */
+ xfs_fsblock_t firstblock; /* 1st blk allocated */
+ int allocated; /* count of alloced */
+ short forksize; /* fork's inode space */
+ char whichfork; /* data or attr fork */
+ char flags; /* flags */
+#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
+ } b;
+ } bc_private; /* per-btree type data */
+} xfs_btree_cur_t;
+
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
+
+
+#define XFS_BTREE_NOERROR 0
+#define XFS_BTREE_ERROR 1
+
+/*
+ * Convert from buffer to btree block header.
+ */
+#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
+
+
+/*
+ * Check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
+ int level, /* level of the btree block */
+ struct xfs_buf *bp); /* buffer containing block, if any */
+
+/*
+ * Check that (long) pointer is ok.
+ */
+int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_dfsbno_t ptr, /* btree block disk address */
+ int level); /* btree block level */
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int error); /* del because of error */
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int /* error */
+xfs_btree_dup_cursor(
+ xfs_btree_cur_t *cur, /* input cursor */
+ xfs_btree_cur_t **ncur);/* output cursor */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+struct xfs_buf * /* buffer for fsbno */
+xfs_btree_get_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ uint lock); /* lock flags for get_buf */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+struct xfs_buf * /* buffer for agno/agbno */
+xfs_btree_get_bufs(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ uint lock); /* lock flags for get_buf */
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int /* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int level); /* level to check */
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+ __int64_t fields, /* bitmask of fields */
+ const short *offsets,/* table of field offsets */
+ int nbits, /* number of bits to inspect */
+ int *first, /* output: first byte offset */
+ int *last); /* output: last byte offset */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int /* error */
+xfs_btree_read_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ uint lock, /* lock flags for read_buf */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval);/* ref count value for buffer */
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void /* error */
+xfs_btree_reada_bufl(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count); /* count of filesystem blocks */
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void /* error */
+xfs_btree_reada_bufs(
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count); /* count of filesystem blocks */
+
+
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
+
+/*
+ * Helpers.
+ */
+static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+ __uint16_t numrecs)
+{
+ block->bb_numrecs = cpu_to_be16(numrecs);
+}
+
+static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_level);
+}
+
+
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
+ */
+#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b))
+#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b))
+#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b))
+#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b))
+#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b))
+#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b))
+#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
+
+#define XFS_FSB_SANITY_CHECK(mp,fsb) \
+ (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+
+#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644
index 00000000..44ff942a
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_types.h"
+#include "xfs_inum.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+
+STATIC void
+xfs_btree_trace_ptr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr ptr,
+ __psunsigned_t *high,
+ __psunsigned_t *low)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ __u64 val = be64_to_cpu(ptr.l);
+ *high = val >> 32;
+ *low = (int)val;
+ } else {
+ *high = 0;
+ *low = be32_to_cpu(ptr.s);
+ }
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
+ */
+void
+xfs_btree_trace_argbi(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *b,
+ int i,
+ int line)
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
+ line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
+ */
+void
+xfs_btree_trace_argbii(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *b,
+ int i0,
+ int i1,
+ int line)
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
+ line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
+ 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for 3 block-length args
+ * and an integer arg.
+ */
+void
+xfs_btree_trace_argfffi(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ xfs_dfiloff_t o,
+ xfs_dfsbno_t b,
+ xfs_dfilblks_t i,
+ int j,
+ int line)
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
+ line,
+ o >> 32, (int)o,
+ b >> 32, (int)b,
+ i >> 32, (int)i,
+ (int)j, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for one integer arg.
+ */
+void
+xfs_btree_trace_argi(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ int i,
+ int line)
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
+ line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, key.
+ */
+void
+xfs_btree_trace_argipk(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ int i,
+ union xfs_btree_ptr ptr,
+ union xfs_btree_key *key,
+ int line)
+{
+ __psunsigned_t high, low;
+ __uint64_t l0, l1;
+
+ xfs_btree_trace_ptr(cur, ptr, &high, &low);
+ cur->bc_ops->trace_key(cur, key, &l0, &l1);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
+ line, i, high, low,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, rec.
+ */
+void
+xfs_btree_trace_argipr(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ int i,
+ union xfs_btree_ptr ptr,
+ union xfs_btree_rec *rec,
+ int line)
+{
+ __psunsigned_t high, low;
+ __uint64_t l0, l1, l2;
+
+ xfs_btree_trace_ptr(cur, ptr, &high, &low);
+ cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
+ line, i,
+ high, low,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ l2 >> 32, (int)l2,
+ 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, key.
+ */
+void
+xfs_btree_trace_argik(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ int i,
+ union xfs_btree_key *key,
+ int line)
+{
+ __uint64_t l0, l1;
+
+ cur->bc_ops->trace_key(cur, key, &l0, &l1);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
+ line, i,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ 0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for record.
+ */
+void
+xfs_btree_trace_argr(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ int line)
+{
+ __uint64_t l0, l1, l2;
+
+ cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
+ line,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ l2 >> 32, (int)l2,
+ 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for the cursor/operation.
+ */
+void
+xfs_btree_trace_cursor(
+ const char *func,
+ struct xfs_btree_cur *cur,
+ int type,
+ int line)
+{
+ __uint32_t s0;
+ __uint64_t l0, l1;
+ char *s;
+
+ switch (type) {
+ case XBT_ARGS:
+ s = "args";
+ break;
+ case XBT_ENTRY:
+ s = "entry";
+ break;
+ case XBT_ERROR:
+ s = "error";
+ break;
+ case XBT_EXIT:
+ s = "exit";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+
+ cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
+ cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
+ s0,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ (__psunsigned_t)cur->bc_bufs[0],
+ (__psunsigned_t)cur->bc_bufs[1],
+ (__psunsigned_t)cur->bc_bufs[2],
+ (__psunsigned_t)cur->bc_bufs[3],
+ (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
+ (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
+}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644
index 00000000..2d8a3098
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BTREE_TRACE_H__
+#define __XFS_BTREE_TRACE_H__
+
+struct xfs_btree_cur;
+struct xfs_buf;
+
+
+/*
+ * Trace hooks.
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ */
+
+#ifdef XFS_BTREE_TRACE
+
+/*
+ * Trace buffer entry types.
+ */
+#define XFS_BTREE_KTRACE_ARGBI 1
+#define XFS_BTREE_KTRACE_ARGBII 2
+#define XFS_BTREE_KTRACE_ARGFFFI 3
+#define XFS_BTREE_KTRACE_ARGI 4
+#define XFS_BTREE_KTRACE_ARGIPK 5
+#define XFS_BTREE_KTRACE_ARGIPR 6
+#define XFS_BTREE_KTRACE_ARGIK 7
+#define XFS_BTREE_KTRACE_ARGR 8
+#define XFS_BTREE_KTRACE_CUR 9
+
+/*
+ * Sub-types for cursor traces.
+ */
+#define XBT_ARGS 0
+#define XBT_ENTRY 1
+#define XBT_ERROR 2
+#define XBT_EXIT 3
+
+void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
+ struct xfs_buf *, int, int);
+void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
+ struct xfs_buf *, int, int, int);
+void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
+void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
+ union xfs_btree_ptr, union xfs_btree_key *, int);
+void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
+ union xfs_btree_ptr, union xfs_btree_rec *, int);
+void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
+ union xfs_btree_key *, int);
+void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
+ union xfs_btree_rec *, int);
+void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
+
+#define XFS_BTREE_TRACE_ARGBI(c, b, i) \
+ xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
+#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \
+ xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
+#define XFS_BTREE_TRACE_ARGI(c, i) \
+ xfs_btree_trace_argi(__func__, c, i, __LINE__)
+#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \
+ xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
+#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \
+ xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
+#define XFS_BTREE_TRACE_ARGIK(c, i, k) \
+ xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
+#define XFS_BTREE_TRACE_ARGR(c, r) \
+ xfs_btree_trace_argr(__func__, c, r, __LINE__)
+#define XFS_BTREE_TRACE_CURSOR(c, t) \
+ xfs_btree_trace_cursor(__func__, c, t, __LINE__)
+#else
+#define XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define XFS_BTREE_TRACE_ARGI(c, i)
+#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define XFS_BTREE_TRACE_CURSOR(c, t)
+#endif /* XFS_BTREE_TRACE */
+
+#endif /* __XFS_BTREE_TRACE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
new file mode 100644
index 00000000..7888a756
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.c
@@ -0,0 +1,1064 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+
+kmem_zone_t *xfs_buf_item_zone;
+
+static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_buf_log_item, bli_item);
+}
+
+
+#ifdef XFS_TRANS_DEBUG
+/*
+ * This function uses an alternate strategy for tracking the bytes
+ * that the user requests to be logged. This can then be used
+ * in conjunction with the bli_orig array in the buf log item to
+ * catch bugs in our callers' code.
+ *
+ * We also double check the bits set in xfs_buf_item_log using a
+ * simple algorithm to check that every byte is accounted for.
+ */
+STATIC void
+xfs_buf_item_log_debug(
+ xfs_buf_log_item_t *bip,
+ uint first,
+ uint last)
+{
+ uint x;
+ uint byte;
+ uint nbytes;
+ uint chunk_num;
+ uint word_num;
+ uint bit_num;
+ uint bit_set;
+ uint *wordp;
+
+ ASSERT(bip->bli_logged != NULL);
+ byte = first;
+ nbytes = last - first + 1;
+ bfset(bip->bli_logged, first, nbytes);
+ for (x = 0; x < nbytes; x++) {
+ chunk_num = byte >> XFS_BLF_SHIFT;
+ word_num = chunk_num >> BIT_TO_WORD_SHIFT;
+ bit_num = chunk_num & (NBWORD - 1);
+ wordp = &(bip->bli_format.blf_data_map[word_num]);
+ bit_set = *wordp & (1 << bit_num);
+ ASSERT(bit_set);
+ byte++;
+ }
+}
+
+/*
+ * This function is called when we flush something into a buffer without
+ * logging it. This happens for things like inodes which are logged
+ * separately from the buffer.
+ */
+void
+xfs_buf_item_flush_log_debug(
+ xfs_buf_t *bp,
+ uint first,
+ uint last)
+{
+ xfs_buf_log_item_t *bip;
+ uint nbytes;
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+ if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
+ return;
+ }
+
+ ASSERT(bip->bli_logged != NULL);
+ nbytes = last - first + 1;
+ bfset(bip->bli_logged, first, nbytes);
+}
+
+/*
+ * This function is called to verify that our callers have logged
+ * all the bytes that they changed.
+ *
+ * It does this by comparing the original copy of the buffer stored in
+ * the buf log item's bli_orig array to the current copy of the buffer
+ * and ensuring that all bytes which mismatch are set in the bli_logged
+ * array of the buf log item.
+ */
+STATIC void
+xfs_buf_item_log_check(
+ xfs_buf_log_item_t *bip)
+{
+ char *orig;
+ char *buffer;
+ int x;
+ xfs_buf_t *bp;
+
+ ASSERT(bip->bli_orig != NULL);
+ ASSERT(bip->bli_logged != NULL);
+
+ bp = bip->bli_buf;
+ ASSERT(XFS_BUF_COUNT(bp) > 0);
+ ASSERT(XFS_BUF_PTR(bp) != NULL);
+ orig = bip->bli_orig;
+ buffer = XFS_BUF_PTR(bp);
+ for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
+ if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
+ xfs_emerg(bp->b_mount,
+ "%s: bip %x buffer %x orig %x index %d",
+ __func__, bip, bp, orig, x);
+ ASSERT(0);
+ }
+ }
+}
+#else
+#define xfs_buf_item_log_debug(x,y,z)
+#define xfs_buf_item_log_check(x)
+#endif
+
+STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
+
+/*
+ * This returns the number of log iovecs needed to log the
+ * given buf log item.
+ *
+ * It calculates this as 1 iovec for the buf log format structure
+ * and 1 for each stretch of non-contiguous chunks to be logged.
+ * Contiguous chunks are logged in a single iovec.
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing.
+ */
+STATIC uint
+xfs_buf_item_size(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ uint nvecs;
+ int next_bit;
+ int last_bit;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * The buffer is stale, so all we need to log
+ * is the buf log format structure with the
+ * cancel flag in it.
+ */
+ trace_xfs_buf_item_size_stale(bip);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ return 1;
+ }
+
+ ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+ nvecs = 1;
+ last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+ bip->bli_format.blf_map_size, 0);
+ ASSERT(last_bit != -1);
+ nvecs++;
+ while (last_bit != -1) {
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+ bip->bli_format.blf_map_size,
+ last_bit + 1);
+ /*
+ * If we run out of bits, leave the loop,
+ * else if we find a new set of bits bump the number of vecs,
+ * else keep scanning the current set of bits.
+ */
+ if (next_bit == -1) {
+ last_bit = -1;
+ } else if (next_bit != last_bit + 1) {
+ last_bit = next_bit;
+ nvecs++;
+ } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
+ (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
+ XFS_BLF_CHUNK)) {
+ last_bit = next_bit;
+ nvecs++;
+ } else {
+ last_bit++;
+ }
+ }
+
+ trace_xfs_buf_item_size(bip);
+ return nvecs;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given log buf item. It fills the first entry with a buf log
+ * format structure, and the rest point to contiguous chunks
+ * within the buffer.
+ */
+STATIC void
+xfs_buf_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *vecp)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ uint base_size;
+ uint nvecs;
+ int first_bit;
+ int last_bit;
+ int next_bit;
+ uint nbits;
+ uint buffer_offset;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
+
+ /*
+ * The size of the base structure is the size of the
+ * declared structure plus the space for the extra words
+ * of the bitmap. We subtract one from the map size, because
+ * the first element of the bitmap is accounted for in the
+ * size of the base structure.
+ */
+ base_size =
+ (uint)(sizeof(xfs_buf_log_format_t) +
+ ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
+ vecp->i_addr = &bip->bli_format;
+ vecp->i_len = base_size;
+ vecp->i_type = XLOG_REG_TYPE_BFORMAT;
+ vecp++;
+ nvecs = 1;
+
+ /*
+ * If it is an inode buffer, transfer the in-memory state to the
+ * format flags and clear the in-memory state. We do not transfer
+ * this state if the inode buffer allocation has not yet been committed
+ * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+ * correct replay of the inode allocation.
+ */
+ if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+ if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+ xfs_log_item_in_current_chkpt(lip)))
+ bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+ bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+ }
+
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * The buffer is stale, so all we need to log
+ * is the buf log format structure with the
+ * cancel flag in it.
+ */
+ trace_xfs_buf_item_format_stale(bip);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ bip->bli_format.blf_size = nvecs;
+ return;
+ }
+
+ /*
+ * Fill in an iovec for each set of contiguous chunks.
+ */
+ first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+ bip->bli_format.blf_map_size, 0);
+ ASSERT(first_bit != -1);
+ last_bit = first_bit;
+ nbits = 1;
+ for (;;) {
+ /*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there. It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ */
+ next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+ bip->bli_format.blf_map_size,
+ (uint)last_bit + 1);
+ /*
+ * If we run out of bits fill in the last iovec and get
+ * out of the loop.
+ * Else if we start a new set of bits then fill in the
+ * iovec for the series we were looking at and start
+ * counting the bits in the new one.
+ * Else we're still in the same set of bits so just
+ * keep counting and scanning.
+ */
+ if (next_bit == -1) {
+ buffer_offset = first_bit * XFS_BLF_CHUNK;
+ vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+ vecp->i_len = nbits * XFS_BLF_CHUNK;
+ vecp->i_type = XLOG_REG_TYPE_BCHUNK;
+ nvecs++;
+ break;
+ } else if (next_bit != last_bit + 1) {
+ buffer_offset = first_bit * XFS_BLF_CHUNK;
+ vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+ vecp->i_len = nbits * XFS_BLF_CHUNK;
+ vecp->i_type = XLOG_REG_TYPE_BCHUNK;
+ nvecs++;
+ vecp++;
+ first_bit = next_bit;
+ last_bit = next_bit;
+ nbits = 1;
+ } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
+ (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
+ XFS_BLF_CHUNK)) {
+ buffer_offset = first_bit * XFS_BLF_CHUNK;
+ vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+ vecp->i_len = nbits * XFS_BLF_CHUNK;
+ vecp->i_type = XLOG_REG_TYPE_BCHUNK;
+/* You would think we need to bump the nvecs here too, but we do not
+ * this number is used by recovery, and it gets confused by the boundary
+ * split here
+ * nvecs++;
+ */
+ vecp++;
+ first_bit = next_bit;
+ last_bit = next_bit;
+ nbits = 1;
+ } else {
+ last_bit++;
+ nbits++;
+ }
+ }
+ bip->bli_format.blf_size = nvecs;
+
+ /*
+ * Check to make sure everything is consistent.
+ */
+ trace_xfs_buf_item_format(bip);
+ xfs_buf_item_log_check(bip);
+}
+
+/*
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
+ */
+STATIC void
+xfs_buf_item_pin(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+
+ ASSERT(XFS_BUF_ISBUSY(bip->bli_buf));
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
+
+ trace_xfs_buf_item_pin(bip);
+
+ atomic_inc(&bip->bli_refcount);
+ atomic_inc(&bip->bli_buf->b_pin_count);
+}
+
+/*
+ * This is called to unpin the buffer associated with the buf log
+ * item which was previously pinned with a call to xfs_buf_item_pin().
+ *
+ * Also drop the reference to the buf item for the current transaction.
+ * If the XFS_BLI_STALE flag is set and we are the last reference,
+ * then free up the buf log item and unlock the buffer.
+ *
+ * If the remove flag is set we are called from uncommit in the
+ * forced-shutdown path. If that is true and the reference count on
+ * the log item is going to drop to zero we need to free the item's
+ * descriptor in the transaction.
+ */
+STATIC void
+xfs_buf_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ xfs_buf_t *bp = bip->bli_buf;
+ struct xfs_ail *ailp = lip->li_ailp;
+ int stale = bip->bli_flags & XFS_BLI_STALE;
+ int freed;
+
+ ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_buf_item_unpin(bip);
+
+ freed = atomic_dec_and_test(&bip->bli_refcount);
+
+ if (atomic_dec_and_test(&bp->b_pin_count))
+ wake_up_all(&bp->b_waiters);
+
+ if (freed && stale) {
+ ASSERT(bip->bli_flags & XFS_BLI_STALE);
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+ ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+ ASSERT(XFS_BUF_ISSTALE(bp));
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+
+ trace_xfs_buf_item_unpin_stale(bip);
+
+ if (remove) {
+ /*
+ * If we are in a transaction context, we have to
+ * remove the log item from the transaction as we are
+ * about to release our reference to the buffer. If we
+ * don't, the unlock that occurs later in
+ * xfs_trans_uncommit() will try to reference the
+ * buffer which we no longer have a hold on.
+ */
+ if (lip->li_desc)
+ xfs_trans_del_item(lip);
+
+ /*
+ * Since the transaction no longer refers to the buffer,
+ * the buffer should no longer refer to the transaction.
+ */
+ XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+ }
+
+ /*
+ * If we get called here because of an IO error, we may
+ * or may not have the item on the AIL. xfs_trans_ail_delete()
+ * will take care of that situation.
+ * xfs_trans_ail_delete() drops the AIL lock.
+ */
+ if (bip->bli_flags & XFS_BLI_STALE_INODE) {
+ xfs_buf_do_callbacks(bp);
+ XFS_BUF_SET_FSPRIVATE(bp, NULL);
+ XFS_BUF_CLR_IODONE_FUNC(bp);
+ } else {
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
+ xfs_buf_item_relse(bp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
+ }
+ xfs_buf_relse(bp);
+ }
+}
+
+/*
+ * This is called to attempt to lock the buffer associated with this
+ * buf log item. Don't sleep on the buffer lock. If we can't get
+ * the lock right away, return 0. If we can get the lock, take a
+ * reference to the buffer. If this is a delayed write buffer that
+ * needs AIL help to be written back, invoke the pushbuf routine
+ * rather than the normal success path.
+ */
+STATIC uint
+xfs_buf_item_trylock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+
+ if (XFS_BUF_ISPINNED(bp))
+ return XFS_ITEM_PINNED;
+ if (!XFS_BUF_CPSEMA(bp))
+ return XFS_ITEM_LOCKED;
+
+ /* take a reference to the buffer. */
+ XFS_BUF_HOLD(bp);
+
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ trace_xfs_buf_item_trylock(bip);
+ if (XFS_BUF_ISDELAYWRITE(bp))
+ return XFS_ITEM_PUSHBUF;
+ return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Release the buffer associated with the buf log item. If there is no dirty
+ * logged data associated with the buffer recorded in the buf log item, then
+ * free the buf log item and remove the reference to it in the buffer.
+ *
+ * This call ignores the recursion count. It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
+ *
+ * We unconditionally drop the transaction's reference to the log item. If the
+ * item was logged, then another reference was taken when it was pinned, so we
+ * can safely drop the transaction reference now. This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
+ *
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
+ * if necessary but do not unlock the buffer. This is for support of
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
+ * free the item.
+ */
+STATIC void
+xfs_buf_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ int aborted;
+ uint hold;
+
+ /* Clear the buffer's association with this transaction. */
+ XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+
+ /*
+ * If this is a transaction abort, don't return early. Instead, allow
+ * the brelse to happen. Normally it would be done for stale
+ * (cancelled) buffers at unpin time, but we'll never go through the
+ * pin/unpin cycle if we abort inside commit.
+ */
+ aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
+
+ /*
+ * Before possibly freeing the buf item, determine if we should
+ * release the buffer at the end of this routine.
+ */
+ hold = bip->bli_flags & XFS_BLI_HOLD;
+
+ /* Clear the per transaction state. */
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
+
+ /*
+ * If the buf item is marked stale, then don't do anything. We'll
+ * unlock the buffer and free the buf item when the buffer is unpinned
+ * for the last time.
+ */
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ trace_xfs_buf_item_unlock_stale(bip);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ if (!aborted) {
+ atomic_dec(&bip->bli_refcount);
+ return;
+ }
+ }
+
+ trace_xfs_buf_item_unlock(bip);
+
+ /*
+ * If the buf item isn't tracking any data, free it, otherwise drop the
+ * reference we hold to it.
+ */
+ if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
+ bip->bli_format.blf_map_size))
+ xfs_buf_item_relse(bp);
+ else
+ atomic_dec(&bip->bli_refcount);
+
+ if (!hold)
+ xfs_buf_relse(bp);
+}
+
+/*
+ * This is called to find out where the oldest active copy of the
+ * buf log item in the on disk log resides now that the last log
+ * write of it completed at the given lsn.
+ * We always re-log all the dirty data in a buffer, so usually the
+ * latest copy in the on disk log is the only one that matters. For
+ * those cases we simply return the given lsn.
+ *
+ * The one exception to this is for buffers full of newly allocated
+ * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
+ * flag set, indicating that only the di_next_unlinked fields from the
+ * inodes in the buffers will be replayed during recovery. If the
+ * original newly allocated inode images have not yet been flushed
+ * when the buffer is so relogged, then we need to make sure that we
+ * keep the old images in the 'active' portion of the log. We do this
+ * by returning the original lsn of that transaction here rather than
+ * the current one.
+ */
+STATIC xfs_lsn_t
+xfs_buf_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+
+ trace_xfs_buf_item_committed(bip);
+
+ if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
+ return lip->li_lsn;
+ return lsn;
+}
+
+/*
+ * The buffer is locked, but is not a delayed write buffer. This happens
+ * if we race with IO completion and hence we don't want to try to write it
+ * again. Just release the buffer.
+ */
+STATIC void
+xfs_buf_item_push(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+
+ trace_xfs_buf_item_push(bip);
+
+ xfs_buf_relse(bp);
+}
+
+/*
+ * The buffer is locked and is a delayed write buffer. Promote the buffer
+ * in the delayed write queue as the caller knows that they must invoke
+ * the xfsbufd to get this buffer written. We have to unlock the buffer
+ * to allow the xfsbufd to write it, too.
+ */
+STATIC bool
+xfs_buf_item_pushbuf(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(XFS_BUF_ISDELAYWRITE(bp));
+
+ trace_xfs_buf_item_pushbuf(bip);
+
+ xfs_buf_delwri_promote(bp);
+ xfs_buf_relse(bp);
+ return true;
+}
+
+STATIC void
+xfs_buf_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_buf_item_ops = {
+ .iop_size = xfs_buf_item_size,
+ .iop_format = xfs_buf_item_format,
+ .iop_pin = xfs_buf_item_pin,
+ .iop_unpin = xfs_buf_item_unpin,
+ .iop_trylock = xfs_buf_item_trylock,
+ .iop_unlock = xfs_buf_item_unlock,
+ .iop_committed = xfs_buf_item_committed,
+ .iop_push = xfs_buf_item_push,
+ .iop_pushbuf = xfs_buf_item_pushbuf,
+ .iop_committing = xfs_buf_item_committing
+};
+
+
+/*
+ * Allocate a new buf log item to go with the given buffer.
+ * Set the buffer's b_fsprivate field to point to the new
+ * buf log item. If there are other item's attached to the
+ * buffer (see xfs_buf_attach_iodone() below), then put the
+ * buf log item at the front.
+ */
+void
+xfs_buf_item_init(
+ xfs_buf_t *bp,
+ xfs_mount_t *mp)
+{
+ xfs_log_item_t *lip;
+ xfs_buf_log_item_t *bip;
+ int chunks;
+ int map_size;
+
+ /*
+ * Check to see if there is already a buf log item for
+ * this buffer. If there is, it is guaranteed to be
+ * the first. If we do already have one, there is
+ * nothing to do here so return.
+ */
+ ASSERT(bp->b_target->bt_mount == mp);
+ if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ if (lip->li_type == XFS_LI_BUF) {
+ return;
+ }
+ }
+
+ /*
+ * chunks is the number of XFS_BLF_CHUNK size pieces
+ * the buffer can be divided into. Make sure not to
+ * truncate any pieces. map_size is the size of the
+ * bitmap needed to describe the chunks of the buffer.
+ */
+ chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
+ map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
+
+ bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
+ KM_SLEEP);
+ xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
+ bip->bli_buf = bp;
+ xfs_buf_hold(bp);
+ bip->bli_format.blf_type = XFS_LI_BUF;
+ bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
+ bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
+ bip->bli_format.blf_map_size = map_size;
+
+#ifdef XFS_TRANS_DEBUG
+ /*
+ * Allocate the arrays for tracking what needs to be logged
+ * and what our callers request to be logged. bli_orig
+ * holds a copy of the original, clean buffer for comparison
+ * against, and bli_logged keeps a 1 bit flag per byte in
+ * the buffer to indicate which bytes the callers have asked
+ * to have logged.
+ */
+ bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
+ memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
+ bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
+#endif
+
+ /*
+ * Put the buf item into the list of items attached to the
+ * buffer at the front.
+ */
+ if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+ bip->bli_item.li_bio_list =
+ XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ }
+ XFS_BUF_SET_FSPRIVATE(bp, bip);
+}
+
+
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+void
+xfs_buf_item_log(
+ xfs_buf_log_item_t *bip,
+ uint first,
+ uint last)
+{
+ uint first_bit;
+ uint last_bit;
+ uint bits_to_set;
+ uint bits_set;
+ uint word_num;
+ uint *wordp;
+ uint bit;
+ uint end_bit;
+ uint mask;
+
+ /*
+ * Mark the item as having some dirty data for
+ * quick reference in xfs_buf_item_dirty.
+ */
+ bip->bli_flags |= XFS_BLI_DIRTY;
+
+ /*
+ * Convert byte offsets to bit numbers.
+ */
+ first_bit = first >> XFS_BLF_SHIFT;
+ last_bit = last >> XFS_BLF_SHIFT;
+
+ /*
+ * Calculate the total number of bits to be set.
+ */
+ bits_to_set = last_bit - first_bit + 1;
+
+ /*
+ * Get a pointer to the first word in the bitmap
+ * to set a bit in.
+ */
+ word_num = first_bit >> BIT_TO_WORD_SHIFT;
+ wordp = &(bip->bli_format.blf_data_map[word_num]);
+
+ /*
+ * Calculate the starting bit in the first word.
+ */
+ bit = first_bit & (uint)(NBWORD - 1);
+
+ /*
+ * First set any bits in the first word of our range.
+ * If it starts at bit 0 of the word, it will be
+ * set below rather than here. That is what the variable
+ * bit tells us. The variable bits_set tracks the number
+ * of bits that have been set so far. End_bit is the number
+ * of the last bit to be set in this word plus one.
+ */
+ if (bit) {
+ end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
+ mask = ((1 << (end_bit - bit)) - 1) << bit;
+ *wordp |= mask;
+ wordp++;
+ bits_set = end_bit - bit;
+ } else {
+ bits_set = 0;
+ }
+
+ /*
+ * Now set bits a whole word at a time that are between
+ * first_bit and last_bit.
+ */
+ while ((bits_to_set - bits_set) >= NBWORD) {
+ *wordp |= 0xffffffff;
+ bits_set += NBWORD;
+ wordp++;
+ }
+
+ /*
+ * Finally, set any bits left to be set in one last partial word.
+ */
+ end_bit = bits_to_set - bits_set;
+ if (end_bit) {
+ mask = (1 << end_bit) - 1;
+ *wordp |= mask;
+ }
+
+ xfs_buf_item_log_debug(bip, first, last);
+}
+
+
+/*
+ * Return 1 if the buffer has some data that has been logged (at any
+ * point, not just the current transaction) and 0 if not.
+ */
+uint
+xfs_buf_item_dirty(
+ xfs_buf_log_item_t *bip)
+{
+ return (bip->bli_flags & XFS_BLI_DIRTY);
+}
+
+STATIC void
+xfs_buf_item_free(
+ xfs_buf_log_item_t *bip)
+{
+#ifdef XFS_TRANS_DEBUG
+ kmem_free(bip->bli_orig);
+ kmem_free(bip->bli_logged);
+#endif /* XFS_TRANS_DEBUG */
+
+ kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
+/*
+ * This is called when the buf log item is no longer needed. It should
+ * free the buf log item associated with the given buffer and clear
+ * the buffer's pointer to the buf log item. If there are no more
+ * items in the list, clear the b_iodone field of the buffer (see
+ * xfs_buf_attach_iodone() below).
+ */
+void
+xfs_buf_item_relse(
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+
+ trace_xfs_buf_item_relse(bp, _RET_IP_);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+ XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
+ if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
+ (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
+ XFS_BUF_CLR_IODONE_FUNC(bp);
+ }
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
+}
+
+
+/*
+ * Add the given log item with its callback to the list of callbacks
+ * to be called when the buffer's I/O completes. If it is not set
+ * already, set the buffer's b_iodone() routine to be
+ * xfs_buf_iodone_callbacks() and link the log item into the list of
+ * items rooted at b_fsprivate. Items are always added as the second
+ * entry in the list if there is a first, because the buf item code
+ * assumes that the buf log item is first.
+ */
+void
+xfs_buf_attach_iodone(
+ xfs_buf_t *bp,
+ void (*cb)(xfs_buf_t *, xfs_log_item_t *),
+ xfs_log_item_t *lip)
+{
+ xfs_log_item_t *head_lip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+ lip->li_cb = cb;
+ if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+ head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ lip->li_bio_list = head_lip->li_bio_list;
+ head_lip->li_bio_list = lip;
+ } else {
+ XFS_BUF_SET_FSPRIVATE(bp, lip);
+ }
+
+ ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
+ (XFS_BUF_IODONE_FUNC(bp) == NULL));
+ XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+}
+
+/*
+ * We can have many callbacks on a buffer. Running the callbacks individually
+ * can cause a lot of contention on the AIL lock, so we allow for a single
+ * callback to be able to scan the remaining lip->li_bio_list for other items
+ * of the same type and callback to be processed in the first call.
+ *
+ * As a result, the loop walking the callback list below will also modify the
+ * list. it removes the first item from the list and then runs the callback.
+ * The loop then restarts from the new head of the list. This allows the
+ * callback to scan and modify the list attached to the buffer and we don't
+ * have to care about maintaining a next item pointer.
+ */
+STATIC void
+xfs_buf_do_callbacks(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
+ XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
+ ASSERT(lip->li_cb != NULL);
+ /*
+ * Clear the next pointer so we don't have any
+ * confusion if the item is added to another buf.
+ * Don't touch the log item after calling its
+ * callback, because it could have freed itself.
+ */
+ lip->li_bio_list = NULL;
+ lip->li_cb(bp, lip);
+ }
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks
+ * attached to them by xfs_buf_attach_iodone(). It should remove each
+ * log item from the buffer's list and call the callback of each in turn.
+ * When done, the buffer's fsprivate field is set to NULL and the buffer
+ * is unlocked with a call to iodone().
+ */
+void
+xfs_buf_iodone_callbacks(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_mount *mp = lip->li_mountp;
+ static ulong lasttime;
+ static xfs_buftarg_t *lasttarg;
+
+ if (likely(!XFS_BUF_GETERROR(bp)))
+ goto do_callbacks;
+
+ /*
+ * If we've already decided to shutdown the filesystem because of
+ * I/O errors, there's no point in giving this a retry.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ XFS_BUF_SUPER_STALE(bp);
+ trace_xfs_buf_item_iodone(bp, _RET_IP_);
+ goto do_callbacks;
+ }
+
+ if (XFS_BUF_TARGET(bp) != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
+ XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+ (__uint64_t)XFS_BUF_ADDR(bp));
+ }
+ lasttarg = XFS_BUF_TARGET(bp);
+
+ /*
+ * If the write was asynchronous then no one will be looking for the
+ * error. Clear the error state and write the buffer out again.
+ *
+ * During sync or umount we'll write all pending buffers again
+ * synchronous, which will catch these errors if they keep hanging
+ * around.
+ */
+ if (XFS_BUF_ISASYNC(bp)) {
+ XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */
+
+ if (!XFS_BUF_ISSTALE(bp)) {
+ XFS_BUF_DELAYWRITE(bp);
+ XFS_BUF_DONE(bp);
+ XFS_BUF_SET_START(bp);
+ }
+ ASSERT(XFS_BUF_IODONE_FUNC(bp));
+ trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+ xfs_buf_relse(bp);
+ return;
+ }
+
+ /*
+ * If the write of the buffer was synchronous, we want to make
+ * sure to return the error to the caller of xfs_bwrite().
+ */
+ XFS_BUF_STALE(bp);
+ XFS_BUF_DONE(bp);
+ XFS_BUF_UNDELAYWRITE(bp);
+
+ trace_xfs_buf_error_relse(bp, _RET_IP_);
+
+do_callbacks:
+ xfs_buf_do_callbacks(bp);
+ XFS_BUF_SET_FSPRIVATE(bp, NULL);
+ XFS_BUF_CLR_IODONE_FUNC(bp);
+ xfs_buf_ioend(bp, 0);
+}
+
+/*
+ * This is the iodone() function for buffers which have been
+ * logged. It is called when they are eventually flushed out.
+ * It should remove the buf item from the AIL, and free the buf item.
+ * It is called by xfs_buf_iodone_callbacks() above which will take
+ * care of cleaning up the buffer itself.
+ */
+void
+xfs_buf_iodone(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ ASSERT(BUF_ITEM(lip)->bli_buf == bp);
+
+ xfs_buf_rele(bp);
+
+ /*
+ * If we are forcibly shutting down, this may well be
+ * off the AIL already. That's because we simulate the
+ * log-committed callbacks to unpin these buffers. Or we may never
+ * have put this item on AIL because of the transaction was
+ * aborted forcibly. xfs_trans_ail_delete() takes care of these.
+ *
+ * Either way, AIL is useless if we're forcing a shutdown.
+ */
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, lip);
+ xfs_buf_item_free(BUF_ITEM(lip));
+}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
new file mode 100644
index 00000000..b6ecd206
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BUF_ITEM_H__
+#define __XFS_BUF_ITEM_H__
+
+extern kmem_zone_t *xfs_buf_item_zone;
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log. The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
+ */
+typedef struct xfs_buf_log_format {
+ unsigned short blf_type; /* buf log item type indicator */
+ unsigned short blf_size; /* size of this item */
+ ushort blf_flags; /* misc state */
+ ushort blf_len; /* number of blocks in this buf */
+ __int64_t blf_blkno; /* starting blkno of this buf */
+ unsigned int blf_map_size; /* size of data bitmap in words */
+ unsigned int blf_data_map[1];/* variable size bitmap of */
+ /* regions of buffer in this item */
+} xfs_buf_log_format_t;
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define XFS_BLF_INODE_BUF 0x1
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define XFS_BLF_CANCEL 0x2
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define XFS_BLF_UDQUOT_BUF 0x4
+#define XFS_BLF_PDQUOT_BUF 0x8
+#define XFS_BLF_GDQUOT_BUF 0x10
+
+#define XFS_BLF_CHUNK 128
+#define XFS_BLF_SHIFT 7
+#define BIT_TO_WORD_SHIFT 5
+#define NBWORD (NBBY * sizeof(unsigned int))
+
+/*
+ * buf log item flags
+ */
+#define XFS_BLI_HOLD 0x01
+#define XFS_BLI_DIRTY 0x02
+#define XFS_BLI_STALE 0x04
+#define XFS_BLI_LOGGED 0x08
+#define XFS_BLI_INODE_ALLOC_BUF 0x10
+#define XFS_BLI_STALE_INODE 0x20
+#define XFS_BLI_INODE_BUF 0x40
+
+#define XFS_BLI_FLAGS \
+ { XFS_BLI_HOLD, "HOLD" }, \
+ { XFS_BLI_DIRTY, "DIRTY" }, \
+ { XFS_BLI_STALE, "STALE" }, \
+ { XFS_BLI_LOGGED, "LOGGED" }, \
+ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
+ { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
+ { XFS_BLI_INODE_BUF, "INODE_BUF" }
+
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_buf_log_item;
+
+/*
+ * This is the in core log item structure used to track information
+ * needed to log buffers. It tracks how many times the lock has been
+ * locked, and which 128 byte chunks of the buffer are dirty.
+ */
+typedef struct xfs_buf_log_item {
+ xfs_log_item_t bli_item; /* common item structure */
+ struct xfs_buf *bli_buf; /* real buffer pointer */
+ unsigned int bli_flags; /* misc flags */
+ unsigned int bli_recur; /* lock recursion count */
+ atomic_t bli_refcount; /* cnt of tp refs */
+#ifdef XFS_TRANS_DEBUG
+ char *bli_orig; /* original buffer copy */
+ char *bli_logged; /* bytes logged (bitmap) */
+#endif
+ xfs_buf_log_format_t bli_format; /* in-log header */
+} xfs_buf_log_item_t;
+
+void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void xfs_buf_item_relse(struct xfs_buf *);
+void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
+void xfs_buf_attach_iodone(struct xfs_buf *,
+ void(*)(struct xfs_buf *, xfs_log_item_t *),
+ xfs_log_item_t *);
+void xfs_buf_iodone_callbacks(struct xfs_buf *);
+void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+
+#ifdef XFS_TRANS_DEBUG
+void
+xfs_buf_item_flush_log_debug(
+ struct xfs_buf *bp,
+ uint first,
+ uint last);
+#else
+#define xfs_buf_item_flush_log_debug(bp, first, last)
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
new file mode 100644
index 00000000..6102ac6d
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.c
@@ -0,0 +1,2463 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_dir2_node.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da_root_split(xfs_da_state_t *state,
+ xfs_da_state_blk_t *existing_root,
+ xfs_da_state_blk_t *new_child);
+STATIC int xfs_da_node_split(xfs_da_state_t *state,
+ xfs_da_state_blk_t *existing_blk,
+ xfs_da_state_blk_t *split_blk,
+ xfs_da_state_blk_t *blk_to_add,
+ int treelevel,
+ int *result);
+STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *node_blk_1,
+ xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da_node_add(xfs_da_state_t *state,
+ xfs_da_state_blk_t *old_node_blk,
+ xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da_root_join(xfs_da_state_t *state,
+ xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+ xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *src_node_blk,
+ xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
+STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
+STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
+STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
+ xfs_da_state_blk_t *drop_blk,
+ xfs_da_state_blk_t *save_blk);
+STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+ xfs_dabuf_t **bpp, int whichfork)
+{
+ xfs_da_intnode_t *node;
+ xfs_dabuf_t *bp;
+ int error;
+ xfs_trans_t *tp;
+
+ tp = args->trans;
+ error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ node = bp->data;
+ node->hdr.info.forw = 0;
+ node->hdr.info.back = 0;
+ node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC);
+ node->hdr.info.pad = 0;
+ node->hdr.count = 0;
+ node->hdr.level = cpu_to_be16(level);
+
+ xfs_da_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+ *bpp = bp;
+ return(0);
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int /* error */
+xfs_da_split(xfs_da_state_t *state)
+{
+ xfs_da_state_blk_t *oldblk, *newblk, *addblk;
+ xfs_da_intnode_t *node;
+ xfs_dabuf_t *bp;
+ int max, action, error, i;
+
+ /*
+ * Walk back up the tree splitting/inserting/adjusting as necessary.
+ * If we need to insert and there isn't room, split the node, then
+ * decide which fragment to insert the new block from below into.
+ * Note that we may split the root this way, but we need more fixup.
+ */
+ max = state->path.active - 1;
+ ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+ ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+ state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
+ addblk = &state->path.blk[max]; /* initial dummy value */
+ for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+ oldblk = &state->path.blk[i];
+ newblk = &state->altpath.blk[i];
+
+ /*
+ * If a leaf node then
+ * Allocate a new leaf node, then rebalance across them.
+ * else if an intermediate node then
+ * We split on the last layer, must we split the node?
+ */
+ switch (oldblk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ error = xfs_attr_leaf_split(state, oldblk, newblk);
+ if ((error != 0) && (error != ENOSPC)) {
+ return(error); /* GROT: attr is inconsistent */
+ }
+ if (!error) {
+ addblk = newblk;
+ break;
+ }
+ /*
+ * Entry wouldn't fit, split the leaf again.
+ */
+ state->extravalid = 1;
+ if (state->inleaf) {
+ state->extraafter = 0; /* before newblk */
+ error = xfs_attr_leaf_split(state, oldblk,
+ &state->extrablk);
+ } else {
+ state->extraafter = 1; /* after newblk */
+ error = xfs_attr_leaf_split(state, newblk,
+ &state->extrablk);
+ }
+ if (error)
+ return(error); /* GROT: attr inconsistent */
+ addblk = newblk;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ error = xfs_dir2_leafn_split(state, oldblk, newblk);
+ if (error)
+ return error;
+ addblk = newblk;
+ break;
+ case XFS_DA_NODE_MAGIC:
+ error = xfs_da_node_split(state, oldblk, newblk, addblk,
+ max - i, &action);
+ xfs_da_buf_done(addblk->bp);
+ addblk->bp = NULL;
+ if (error)
+ return(error); /* GROT: dir is inconsistent */
+ /*
+ * Record the newly split block for the next time thru?
+ */
+ if (action)
+ addblk = newblk;
+ else
+ addblk = NULL;
+ break;
+ }
+
+ /*
+ * Update the btree to show the new hashval for this child.
+ */
+ xfs_da_fixhashpath(state, &state->path);
+ /*
+ * If we won't need this block again, it's getting dropped
+ * from the active path by the loop control, so we need
+ * to mark it done now.
+ */
+ if (i > 0 || !addblk)
+ xfs_da_buf_done(oldblk->bp);
+ }
+ if (!addblk)
+ return(0);
+
+ /*
+ * Split the root node.
+ */
+ ASSERT(state->path.active == 0);
+ oldblk = &state->path.blk[0];
+ error = xfs_da_root_split(state, oldblk, addblk);
+ if (error) {
+ xfs_da_buf_done(oldblk->bp);
+ xfs_da_buf_done(addblk->bp);
+ addblk->bp = NULL;
+ return(error); /* GROT: dir is inconsistent */
+ }
+
+ /*
+ * Update pointers to the node which used to be block 0 and
+ * just got bumped because of the addition of a new root node.
+ * There might be three blocks involved if a double split occurred,
+ * and the original block 0 could be at any position in the list.
+ */
+
+ node = oldblk->bp->data;
+ if (node->hdr.info.forw) {
+ if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
+ bp = addblk->bp;
+ } else {
+ ASSERT(state->extravalid);
+ bp = state->extrablk.bp;
+ }
+ node = bp->data;
+ node->hdr.info.back = cpu_to_be32(oldblk->blkno);
+ xfs_da_log_buf(state->args->trans, bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
+ }
+ node = oldblk->bp->data;
+ if (node->hdr.info.back) {
+ if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
+ bp = addblk->bp;
+ } else {
+ ASSERT(state->extravalid);
+ bp = state->extrablk.bp;
+ }
+ node = bp->data;
+ node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
+ xfs_da_log_buf(state->args->trans, bp,
+ XFS_DA_LOGRANGE(node, &node->hdr.info,
+ sizeof(node->hdr.info)));
+ }
+ xfs_da_buf_done(oldblk->bp);
+ xfs_da_buf_done(addblk->bp);
+ addblk->bp = NULL;
+ return(0);
+}
+
+/*
+ * Split the root. We have to create a new root and point to the two
+ * parts (the split old root) that we just created. Copy block zero to
+ * the EOF, extending the inode in process.
+ */
+STATIC int /* error */
+xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2)
+{
+ xfs_da_intnode_t *node, *oldroot;
+ xfs_da_args_t *args;
+ xfs_dablk_t blkno;
+ xfs_dabuf_t *bp;
+ int error, size;
+ xfs_inode_t *dp;
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ xfs_dir2_leaf_t *leaf;
+
+ /*
+ * Copy the existing (incorrect) block from the root node position
+ * to a free space somewhere.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error)
+ return(error);
+ dp = args->dp;
+ tp = args->trans;
+ mp = state->mp;
+ error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ node = bp->data;
+ oldroot = blk1->bp->data;
+ if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC) {
+ size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
+ (char *)oldroot);
+ } else {
+ ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ leaf = (xfs_dir2_leaf_t *)oldroot;
+ size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
+ (char *)leaf);
+ }
+ memcpy(node, oldroot, size);
+ xfs_da_log_buf(tp, bp, 0, size - 1);
+ xfs_da_buf_done(blk1->bp);
+ blk1->bp = bp;
+ blk1->blkno = blkno;
+
+ /*
+ * Set up the new root node.
+ */
+ error = xfs_da_node_create(args,
+ (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
+ be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
+ if (error)
+ return(error);
+ node = bp->data;
+ node->btree[0].hashval = cpu_to_be32(blk1->hashval);
+ node->btree[0].before = cpu_to_be32(blk1->blkno);
+ node->btree[1].hashval = cpu_to_be32(blk2->hashval);
+ node->btree[1].before = cpu_to_be32(blk2->blkno);
+ node->hdr.count = cpu_to_be16(2);
+
+#ifdef DEBUG
+ if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC) {
+ ASSERT(blk1->blkno >= mp->m_dirleafblk &&
+ blk1->blkno < mp->m_dirfreeblk);
+ ASSERT(blk2->blkno >= mp->m_dirleafblk &&
+ blk2->blkno < mp->m_dirfreeblk);
+ }
+#endif
+
+ /* Header is already logged by xfs_da_node_create */
+ xfs_da_log_buf(tp, bp,
+ XFS_DA_LOGRANGE(node, node->btree,
+ sizeof(xfs_da_node_entry_t) * 2));
+ xfs_da_buf_done(bp);
+
+ return(0);
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int /* error */
+xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+ xfs_da_state_blk_t *newblk,
+ xfs_da_state_blk_t *addblk,
+ int treelevel, int *result)
+{
+ xfs_da_intnode_t *node;
+ xfs_dablk_t blkno;
+ int newcount, error;
+ int useextra;
+
+ node = oldblk->bp->data;
+ ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+
+ /*
+ * With V2 dirs the extra block is data or freespace.
+ */
+ useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
+ newcount = 1 + useextra;
+ /*
+ * Do we have to split the node?
+ */
+ if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) {
+ /*
+ * Allocate a new node, add to the doubly linked chain of
+ * nodes, then move some of our excess entries into it.
+ */
+ error = xfs_da_grow_inode(state->args, &blkno);
+ if (error)
+ return(error); /* GROT: dir is inconsistent */
+
+ error = xfs_da_node_create(state->args, blkno, treelevel,
+ &newblk->bp, state->args->whichfork);
+ if (error)
+ return(error); /* GROT: dir is inconsistent */
+ newblk->blkno = blkno;
+ newblk->magic = XFS_DA_NODE_MAGIC;
+ xfs_da_node_rebalance(state, oldblk, newblk);
+ error = xfs_da_blk_link(state, oldblk, newblk);
+ if (error)
+ return(error);
+ *result = 1;
+ } else {
+ *result = 0;
+ }
+
+ /*
+ * Insert the new entry(s) into the correct block
+ * (updating last hashval in the process).
+ *
+ * xfs_da_node_add() inserts BEFORE the given index,
+ * and as a result of using node_lookup_int() we always
+ * point to a valid entry (not after one), but a split
+ * operation always results in a new block whose hashvals
+ * FOLLOW the current block.
+ *
+ * If we had double-split op below us, then add the extra block too.
+ */
+ node = oldblk->bp->data;
+ if (oldblk->index <= be16_to_cpu(node->hdr.count)) {
+ oldblk->index++;
+ xfs_da_node_add(state, oldblk, addblk);
+ if (useextra) {
+ if (state->extraafter)
+ oldblk->index++;
+ xfs_da_node_add(state, oldblk, &state->extrablk);
+ state->extravalid = 0;
+ }
+ } else {
+ newblk->index++;
+ xfs_da_node_add(state, newblk, addblk);
+ if (useextra) {
+ if (state->extraafter)
+ newblk->index++;
+ xfs_da_node_add(state, newblk, &state->extrablk);
+ state->extravalid = 0;
+ }
+ }
+
+ return(0);
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
+STATIC void
+xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2)
+{
+ xfs_da_intnode_t *node1, *node2, *tmpnode;
+ xfs_da_node_entry_t *btree_s, *btree_d;
+ int count, tmp;
+ xfs_trans_t *tp;
+
+ node1 = blk1->bp->data;
+ node2 = blk2->bp->data;
+ /*
+ * Figure out how many entries need to move, and in which direction.
+ * Swap the nodes around if that makes it simpler.
+ */
+ if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
+ ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) ||
+ (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
+ be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
+ tmpnode = node1;
+ node1 = node2;
+ node2 = tmpnode;
+ }
+ ASSERT(be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ ASSERT(be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2;
+ if (count == 0)
+ return;
+ tp = state->args->trans;
+ /*
+ * Two cases: high-to-low and low-to-high.
+ */
+ if (count > 0) {
+ /*
+ * Move elements in node2 up to make a hole.
+ */
+ if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) {
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &node2->btree[0];
+ btree_d = &node2->btree[count];
+ memmove(btree_d, btree_s, tmp);
+ }
+
+ /*
+ * Move the req'd B-tree elements from high in node1 to
+ * low in node2.
+ */
+ be16_add_cpu(&node2->hdr.count, count);
+ tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count];
+ btree_d = &node2->btree[0];
+ memcpy(btree_d, btree_s, tmp);
+ be16_add_cpu(&node1->hdr.count, -count);
+ } else {
+ /*
+ * Move the req'd B-tree elements from low in node2 to
+ * high in node1.
+ */
+ count = -count;
+ tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &node2->btree[0];
+ btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
+ memcpy(btree_d, btree_s, tmp);
+ be16_add_cpu(&node1->hdr.count, count);
+ xfs_da_log_buf(tp, blk1->bp,
+ XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+ /*
+ * Move elements in node2 down to fill the hole.
+ */
+ tmp = be16_to_cpu(node2->hdr.count) - count;
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ btree_s = &node2->btree[count];
+ btree_d = &node2->btree[0];
+ memmove(btree_d, btree_s, tmp);
+ be16_add_cpu(&node2->hdr.count, -count);
+ }
+
+ /*
+ * Log header of node 1 and all current bits of node 2.
+ */
+ xfs_da_log_buf(tp, blk1->bp,
+ XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
+ xfs_da_log_buf(tp, blk2->bp,
+ XFS_DA_LOGRANGE(node2, &node2->hdr,
+ sizeof(node2->hdr) +
+ sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count)));
+
+ /*
+ * Record the last hashval from each block for upward propagation.
+ * (note: don't use the swapped node pointers)
+ */
+ node1 = blk1->bp->data;
+ node2 = blk2->bp->data;
+ blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval);
+ blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval);
+
+ /*
+ * Adjust the expected index for insertion.
+ */
+ if (blk1->index >= be16_to_cpu(node1->hdr.count)) {
+ blk2->index = blk1->index - be16_to_cpu(node1->hdr.count);
+ blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */
+ }
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+ xfs_da_state_blk_t *newblk)
+{
+ xfs_da_intnode_t *node;
+ xfs_da_node_entry_t *btree;
+ int tmp;
+
+ node = oldblk->bp->data;
+ ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
+ ASSERT(newblk->blkno != 0);
+ if (state->args->whichfork == XFS_DATA_FORK)
+ ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
+ newblk->blkno < state->mp->m_dirfreeblk);
+
+ /*
+ * We may need to make some room before we insert the new node.
+ */
+ tmp = 0;
+ btree = &node->btree[ oldblk->index ];
+ if (oldblk->index < be16_to_cpu(node->hdr.count)) {
+ tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree);
+ memmove(btree + 1, btree, tmp);
+ }
+ btree->hashval = cpu_to_be32(newblk->hashval);
+ btree->before = cpu_to_be32(newblk->blkno);
+ xfs_da_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
+ be16_add_cpu(&node->hdr.count, 1);
+ xfs_da_log_buf(state->args->trans, oldblk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+ /*
+ * Copy the last hash value from the oldblk to propagate upwards.
+ */
+ oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da_join(xfs_da_state_t *state)
+{
+ xfs_da_state_blk_t *drop_blk, *save_blk;
+ int action, error;
+
+ action = 0;
+ drop_blk = &state->path.blk[ state->path.active-1 ];
+ save_blk = &state->altpath.blk[ state->path.active-1 ];
+ ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+ ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+ drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ /*
+ * Walk back up the tree joining/deallocating as necessary.
+ * When we stop dropping blocks, break out.
+ */
+ for ( ; state->path.active >= 2; drop_blk--, save_blk--,
+ state->path.active--) {
+ /*
+ * See if we can combine the block with a neighbor.
+ * (action == 0) => no options, just leave
+ * (action == 1) => coalesce, then unlink
+ * (action == 2) => block empty, unlink it
+ */
+ switch (drop_blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ error = xfs_attr_leaf_toosmall(state, &action);
+ if (error)
+ return(error);
+ if (action == 0)
+ return(0);
+ xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ error = xfs_dir2_leafn_toosmall(state, &action);
+ if (error)
+ return error;
+ if (action == 0)
+ return 0;
+ xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ /*
+ * Remove the offending node, fixup hashvals,
+ * check for a toosmall neighbor.
+ */
+ xfs_da_node_remove(state, drop_blk);
+ xfs_da_fixhashpath(state, &state->path);
+ error = xfs_da_node_toosmall(state, &action);
+ if (error)
+ return(error);
+ if (action == 0)
+ return 0;
+ xfs_da_node_unbalance(state, drop_blk, save_blk);
+ break;
+ }
+ xfs_da_fixhashpath(state, &state->altpath);
+ error = xfs_da_blk_unlink(state, drop_blk, save_blk);
+ xfs_da_state_kill_altpath(state);
+ if (error)
+ return(error);
+ error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+ drop_blk->bp);
+ drop_blk->bp = NULL;
+ if (error)
+ return(error);
+ }
+ /*
+ * We joined all the way to the top. If it turns out that
+ * we only have one entry in the root, make the child block
+ * the new root.
+ */
+ xfs_da_node_remove(state, drop_blk);
+ xfs_da_fixhashpath(state, &state->path);
+ error = xfs_da_root_join(state, &state->path.blk[0]);
+ return(error);
+}
+
+/*
+ * We have only one entry in the root. Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
+{
+ xfs_da_intnode_t *oldroot;
+ /* REFERENCED */
+ xfs_da_blkinfo_t *blkinfo;
+ xfs_da_args_t *args;
+ xfs_dablk_t child;
+ xfs_dabuf_t *bp;
+ int error;
+
+ args = state->args;
+ ASSERT(args != NULL);
+ ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+ oldroot = root_blk->bp->data;
+ ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ ASSERT(!oldroot->hdr.info.forw);
+ ASSERT(!oldroot->hdr.info.back);
+
+ /*
+ * If the root has more than one child, then don't do anything.
+ */
+ if (be16_to_cpu(oldroot->hdr.count) > 1)
+ return(0);
+
+ /*
+ * Read in the (only) child block, then copy those bytes into
+ * the root block's buffer and free the original child block.
+ */
+ child = be32_to_cpu(oldroot->btree[0].before);
+ ASSERT(child != 0);
+ error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
+ args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ blkinfo = bp->data;
+ if (be16_to_cpu(oldroot->hdr.level) == 1) {
+ ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC ||
+ be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC);
+ } else {
+ ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC);
+ }
+ ASSERT(!blkinfo->forw);
+ ASSERT(!blkinfo->back);
+ memcpy(root_blk->bp->data, bp->data, state->blocksize);
+ xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+ error = xfs_da_shrink_inode(args, child, bp);
+ return(error);
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+STATIC int
+xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
+{
+ xfs_da_intnode_t *node;
+ xfs_da_state_blk_t *blk;
+ xfs_da_blkinfo_t *info;
+ int count, forward, error, retval, i;
+ xfs_dablk_t blkno;
+ xfs_dabuf_t *bp;
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[ state->path.active-1 ];
+ info = blk->bp->data;
+ ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC);
+ node = (xfs_da_intnode_t *)info;
+ count = be16_to_cpu(node->hdr.count);
+ if (count > (state->node_ents >> 1)) {
+ *action = 0; /* blk over 50%, don't try to join */
+ return(0); /* blk over 50%, don't try to join */
+ }
+
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (info->forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ if (error)
+ return(error);
+ if (retval) {
+ *action = 0;
+ } else {
+ *action = 2;
+ }
+ return(0);
+ }
+
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink a directory over time.
+ */
+ /* start with smaller blk num */
+ forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+ for (i = 0; i < 2; forward = !forward, i++) {
+ if (forward)
+ blkno = be32_to_cpu(info->forw);
+ else
+ blkno = be32_to_cpu(info->back);
+ if (blkno == 0)
+ continue;
+ error = xfs_da_read_buf(state->args->trans, state->args->dp,
+ blkno, -1, &bp, state->args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+
+ node = (xfs_da_intnode_t *)info;
+ count = state->node_ents;
+ count -= state->node_ents >> 2;
+ count -= be16_to_cpu(node->hdr.count);
+ node = bp->data;
+ ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ count -= be16_to_cpu(node->hdr.count);
+ xfs_da_brelse(state->args->trans, bp);
+ if (count >= 0)
+ break; /* fits with at least 25% to spare */
+ }
+ if (i >= 2) {
+ *action = 0;
+ return(0);
+ }
+
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno) {
+ error = xfs_da_path_shift(state, &state->altpath, forward,
+ 0, &retval);
+ if (error) {
+ return(error);
+ }
+ if (retval) {
+ *action = 0;
+ return(0);
+ }
+ } else {
+ error = xfs_da_path_shift(state, &state->path, forward,
+ 0, &retval);
+ if (error) {
+ return(error);
+ }
+ if (retval) {
+ *action = 0;
+ return(0);
+ }
+ }
+ *action = 1;
+ return(0);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary,
+ * when we stop making changes, return.
+ */
+void
+xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+{
+ xfs_da_state_blk_t *blk;
+ xfs_da_intnode_t *node;
+ xfs_da_node_entry_t *btree;
+ xfs_dahash_t lasthash=0;
+ int level, count;
+
+ level = path->active-1;
+ blk = &path->blk[ level ];
+ switch (blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ case XFS_DA_NODE_MAGIC:
+ lasthash = xfs_da_node_lasthash(blk->bp, &count);
+ if (count == 0)
+ return;
+ break;
+ }
+ for (blk--, level--; level >= 0; blk--, level--) {
+ node = blk->bp->data;
+ ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ btree = &node->btree[ blk->index ];
+ if (be32_to_cpu(btree->hashval) == lasthash)
+ break;
+ blk->hashval = lasthash;
+ btree->hashval = cpu_to_be32(lasthash);
+ xfs_da_log_buf(state->args->trans, blk->bp,
+ XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+
+ lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+ }
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
+{
+ xfs_da_intnode_t *node;
+ xfs_da_node_entry_t *btree;
+ int tmp;
+
+ node = drop_blk->bp->data;
+ ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
+ ASSERT(drop_blk->index >= 0);
+
+ /*
+ * Copy over the offending entry, or just zero it out.
+ */
+ btree = &node->btree[drop_blk->index];
+ if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) {
+ tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1;
+ tmp *= (uint)sizeof(xfs_da_node_entry_t);
+ memmove(btree, btree + 1, tmp);
+ xfs_da_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, btree, tmp));
+ btree = &node->btree[be16_to_cpu(node->hdr.count)-1];
+ }
+ memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
+ xfs_da_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+ be16_add_cpu(&node->hdr.count, -1);
+ xfs_da_log_buf(state->args->trans, drop_blk->bp,
+ XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+ /*
+ * Copy the last hash value from the block to propagate upwards.
+ */
+ btree--;
+ drop_blk->hashval = be32_to_cpu(btree->hashval);
+}
+
+/*
+ * Unbalance the btree elements between two intermediate nodes,
+ * move all Btree elements from one node into another.
+ */
+STATIC void
+xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+ xfs_da_state_blk_t *save_blk)
+{
+ xfs_da_intnode_t *drop_node, *save_node;
+ xfs_da_node_entry_t *btree;
+ int tmp;
+ xfs_trans_t *tp;
+
+ drop_node = drop_blk->bp->data;
+ save_node = save_blk->bp->data;
+ ASSERT(be16_to_cpu(drop_node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ ASSERT(be16_to_cpu(save_node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ tp = state->args->trans;
+
+ /*
+ * If the dying block has lower hashvals, then move all the
+ * elements in the remaining block up to make a hole.
+ */
+ if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) ||
+ (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) <
+ be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval)))
+ {
+ btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)];
+ tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
+ memmove(btree, &save_node->btree[0], tmp);
+ btree = &save_node->btree[0];
+ xfs_da_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, btree,
+ (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) *
+ sizeof(xfs_da_node_entry_t)));
+ } else {
+ btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)];
+ xfs_da_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, btree,
+ be16_to_cpu(drop_node->hdr.count) *
+ sizeof(xfs_da_node_entry_t)));
+ }
+
+ /*
+ * Move all the B-tree elements from drop_blk to save_blk.
+ */
+ tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
+ memcpy(btree, &drop_node->btree[0], tmp);
+ be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
+
+ xfs_da_log_buf(tp, save_blk->bp,
+ XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+ sizeof(save_node->hdr)));
+
+ /*
+ * Save the last hashval in the remaining block for upward propagation.
+ */
+ save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend. This is a
+ * pruned depth-first tree search.
+ */
+int /* error */
+xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
+{
+ xfs_da_state_blk_t *blk;
+ xfs_da_blkinfo_t *curr;
+ xfs_da_intnode_t *node;
+ xfs_da_node_entry_t *btree;
+ xfs_dablk_t blkno;
+ int probe, span, max, error, retval;
+ xfs_dahash_t hashval, btreehashval;
+ xfs_da_args_t *args;
+
+ args = state->args;
+
+ /*
+ * Descend thru the B-tree searching each level for the right
+ * node to use, until the right hashval is found.
+ */
+ blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
+ for (blk = &state->path.blk[0], state->path.active = 1;
+ state->path.active <= XFS_DA_NODE_MAXDEPTH;
+ blk++, state->path.active++) {
+ /*
+ * Read the next node down in the tree.
+ */
+ blk->blkno = blkno;
+ error = xfs_da_read_buf(args->trans, args->dp, blkno,
+ -1, &blk->bp, args->whichfork);
+ if (error) {
+ blk->blkno = 0;
+ state->path.active--;
+ return(error);
+ }
+ curr = blk->bp->data;
+ blk->magic = be16_to_cpu(curr->magic);
+ ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
+ blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ /*
+ * Search an intermediate node for a match.
+ */
+ if (blk->magic == XFS_DA_NODE_MAGIC) {
+ node = blk->bp->data;
+ max = be16_to_cpu(node->hdr.count);
+ blk->hashval = be32_to_cpu(node->btree[max-1].hashval);
+
+ /*
+ * Binary search. (note: small blocks will skip loop)
+ */
+ probe = span = max / 2;
+ hashval = args->hashval;
+ for (btree = &node->btree[probe]; span > 4;
+ btree = &node->btree[probe]) {
+ span /= 2;
+ btreehashval = be32_to_cpu(btree->hashval);
+ if (btreehashval < hashval)
+ probe += span;
+ else if (btreehashval > hashval)
+ probe -= span;
+ else
+ break;
+ }
+ ASSERT((probe >= 0) && (probe < max));
+ ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval));
+
+ /*
+ * Since we may have duplicate hashval's, find the first
+ * matching hashval in the node.
+ */
+ while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) {
+ btree--;
+ probe--;
+ }
+ while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) {
+ btree++;
+ probe++;
+ }
+
+ /*
+ * Pick the right block to descend on.
+ */
+ if (probe == max) {
+ blk->index = max-1;
+ blkno = be32_to_cpu(node->btree[max-1].before);
+ } else {
+ blk->index = probe;
+ blkno = be32_to_cpu(btree->before);
+ }
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+ break;
+ } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+ blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
+ break;
+ }
+ }
+
+ /*
+ * A leaf block that ends in the hashval that we are interested in
+ * (final hashval == search hashval) means that the next block may
+ * contain more entries with the same hashval, shift upward to the
+ * next leaf and keep searching.
+ */
+ for (;;) {
+ if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+ retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
+ &blk->index, state);
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ retval = xfs_attr_leaf_lookup_int(blk->bp, args);
+ blk->index = args->index;
+ args->blkno = blk->blkno;
+ } else {
+ ASSERT(0);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ if (((retval == ENOENT) || (retval == ENOATTR)) &&
+ (blk->hashval == args->hashval)) {
+ error = xfs_da_path_shift(state, &state->path, 1, 1,
+ &retval);
+ if (error)
+ return(error);
+ if (retval == 0) {
+ continue;
+ } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+ /* path_shift() gives ENOENT */
+ retval = XFS_ERROR(ENOATTR);
+ }
+ }
+ break;
+ }
+ *result = retval;
+ return(0);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int /* error */
+xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+ xfs_da_state_blk_t *new_blk)
+{
+ xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
+ xfs_da_args_t *args;
+ int before=0, error;
+ xfs_dabuf_t *bp;
+
+ /*
+ * Set up environment.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ old_info = old_blk->bp->data;
+ new_info = new_blk->bp->data;
+ ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+ old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
+ ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
+ ASSERT(old_blk->magic == new_blk->magic);
+
+ switch (old_blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
+ break;
+ case XFS_DA_NODE_MAGIC:
+ before = xfs_da_node_order(old_blk->bp, new_blk->bp);
+ break;
+ }
+
+ /*
+ * Link blocks in appropriate order.
+ */
+ if (before) {
+ /*
+ * Link new block in before existing block.
+ */
+ new_info->forw = cpu_to_be32(old_blk->blkno);
+ new_info->back = old_info->back;
+ if (old_info->back) {
+ error = xfs_da_read_buf(args->trans, args->dp,
+ be32_to_cpu(old_info->back),
+ -1, &bp, args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ tmp_info = bp->data;
+ ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic));
+ ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
+ tmp_info->forw = cpu_to_be32(new_blk->blkno);
+ xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+ xfs_da_buf_done(bp);
+ }
+ old_info->back = cpu_to_be32(new_blk->blkno);
+ } else {
+ /*
+ * Link new block in after existing block.
+ */
+ new_info->forw = old_info->forw;
+ new_info->back = cpu_to_be32(old_blk->blkno);
+ if (old_info->forw) {
+ error = xfs_da_read_buf(args->trans, args->dp,
+ be32_to_cpu(old_info->forw),
+ -1, &bp, args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ tmp_info = bp->data;
+ ASSERT(tmp_info->magic == old_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
+ tmp_info->back = cpu_to_be32(new_blk->blkno);
+ xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+ xfs_da_buf_done(bp);
+ }
+ old_info->forw = cpu_to_be32(new_blk->blkno);
+ }
+
+ xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+ xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+ return(0);
+}
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
+{
+ xfs_da_intnode_t *node1, *node2;
+
+ node1 = node1_bp->data;
+ node2 = node2_bp->data;
+ ASSERT((be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC) &&
+ (be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC));
+ if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
+ ((be32_to_cpu(node2->btree[0].hashval) <
+ be32_to_cpu(node1->btree[0].hashval)) ||
+ (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
+ be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
+ return(1);
+ }
+ return(0);
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
+{
+ xfs_da_intnode_t *node;
+
+ node = bp->data;
+ ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ if (count)
+ *count = be16_to_cpu(node->hdr.count);
+ if (!node->hdr.count)
+ return(0);
+ return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+STATIC int /* error */
+xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+ xfs_da_state_blk_t *save_blk)
+{
+ xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
+ xfs_da_args_t *args;
+ xfs_dabuf_t *bp;
+ int error;
+
+ /*
+ * Set up environment.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ save_info = save_blk->bp->data;
+ drop_info = drop_blk->bp->data;
+ ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+ save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+ save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+ ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
+ ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
+ ASSERT(save_blk->magic == drop_blk->magic);
+ ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
+ (be32_to_cpu(save_info->back) == drop_blk->blkno));
+ ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
+ (be32_to_cpu(drop_info->back) == save_blk->blkno));
+
+ /*
+ * Unlink the leaf block from the doubly linked chain of leaves.
+ */
+ if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
+ save_info->back = drop_info->back;
+ if (drop_info->back) {
+ error = xfs_da_read_buf(args->trans, args->dp,
+ be32_to_cpu(drop_info->back),
+ -1, &bp, args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ tmp_info = bp->data;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
+ tmp_info->forw = cpu_to_be32(save_blk->blkno);
+ xfs_da_log_buf(args->trans, bp, 0,
+ sizeof(*tmp_info) - 1);
+ xfs_da_buf_done(bp);
+ }
+ } else {
+ save_info->forw = drop_info->forw;
+ if (drop_info->forw) {
+ error = xfs_da_read_buf(args->trans, args->dp,
+ be32_to_cpu(drop_info->forw),
+ -1, &bp, args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(bp != NULL);
+ tmp_info = bp->data;
+ ASSERT(tmp_info->magic == save_info->magic);
+ ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
+ tmp_info->back = cpu_to_be32(save_blk->blkno);
+ xfs_da_log_buf(args->trans, bp, 0,
+ sizeof(*tmp_info) - 1);
+ xfs_da_buf_done(bp);
+ }
+ }
+
+ xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+ return(0);
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+int /* error */
+xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+ int forward, int release, int *result)
+{
+ xfs_da_state_blk_t *blk;
+ xfs_da_blkinfo_t *info;
+ xfs_da_intnode_t *node;
+ xfs_da_args_t *args;
+ xfs_dablk_t blkno=0;
+ int level, error;
+
+ /*
+ * Roll up the Btree looking for the first block where our
+ * current index is not at the edge of the block. Note that
+ * we skip the bottom layer because we want the sibling block.
+ */
+ args = state->args;
+ ASSERT(args != NULL);
+ ASSERT(path != NULL);
+ ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ level = (path->active-1) - 1; /* skip bottom layer in path */
+ for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+ ASSERT(blk->bp != NULL);
+ node = blk->bp->data;
+ ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
+ if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) {
+ blk->index++;
+ blkno = be32_to_cpu(node->btree[blk->index].before);
+ break;
+ } else if (!forward && (blk->index > 0)) {
+ blk->index--;
+ blkno = be32_to_cpu(node->btree[blk->index].before);
+ break;
+ }
+ }
+ if (level < 0) {
+ *result = XFS_ERROR(ENOENT); /* we're out of our tree */
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ return(0);
+ }
+
+ /*
+ * Roll down the edge of the subtree until we reach the
+ * same depth we were at originally.
+ */
+ for (blk++, level++; level < path->active; blk++, level++) {
+ /*
+ * Release the old block.
+ * (if it's dirty, trans won't actually let go)
+ */
+ if (release)
+ xfs_da_brelse(args->trans, blk->bp);
+
+ /*
+ * Read the next child block.
+ */
+ blk->blkno = blkno;
+ error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
+ &blk->bp, args->whichfork);
+ if (error)
+ return(error);
+ ASSERT(blk->bp != NULL);
+ info = blk->bp->data;
+ ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC ||
+ be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC ||
+ be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC);
+ blk->magic = be16_to_cpu(info->magic);
+ if (blk->magic == XFS_DA_NODE_MAGIC) {
+ node = (xfs_da_intnode_t *)info;
+ blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+ if (forward)
+ blk->index = 0;
+ else
+ blk->index = be16_to_cpu(node->hdr.count)-1;
+ blkno = be32_to_cpu(node->btree[blk->index].before);
+ } else {
+ ASSERT(level == path->active-1);
+ blk->index = 0;
+ switch(blk->magic) {
+ case XFS_ATTR_LEAF_MAGIC:
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
+ NULL);
+ break;
+ case XFS_DIR2_LEAFN_MAGIC:
+ blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
+ NULL);
+ break;
+ default:
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
+ blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ break;
+ }
+ }
+ }
+ *result = 0;
+ return(0);
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(const __uint8_t *name, int namelen)
+{
+ xfs_dahash_t hash;
+
+ /*
+ * Do four characters at a time as long as we can.
+ */
+ for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
+ hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+ (name[3] << 0) ^ rol32(hash, 7 * 4);
+
+ /*
+ * Now do the rest of the characters.
+ */
+ switch (namelen) {
+ case 3:
+ return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+ rol32(hash, 7 * 3);
+ case 2:
+ return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
+ case 1:
+ return (name[0] << 0) ^ rol32(hash, 7 * 1);
+ default: /* case 0: */
+ return hash;
+ }
+}
+
+enum xfs_dacmp
+xfs_da_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+ XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+static xfs_dahash_t
+xfs_default_hashname(
+ struct xfs_name *name)
+{
+ return xfs_da_hashname(name->name, name->len);
+}
+
+const struct xfs_nameops xfs_default_nameops = {
+ .hashname = xfs_default_hashname,
+ .compname = xfs_da_compname
+};
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
+{
+ xfs_fileoff_t bno, b;
+ xfs_bmbt_irec_t map;
+ xfs_bmbt_irec_t *mapp;
+ xfs_inode_t *dp;
+ int nmap, error, w, count, c, got, i, mapi;
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ xfs_drfsbno_t nblks;
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ w = args->whichfork;
+ tp = args->trans;
+ nblks = dp->i_d.di_nblocks;
+
+ /*
+ * For new directories adjust the file offset and block count.
+ */
+ if (w == XFS_DATA_FORK) {
+ bno = mp->m_dirleafblk;
+ count = mp->m_dirblkfsbs;
+ } else {
+ bno = 0;
+ count = 1;
+ }
+ /*
+ * Find a spot in the file space to put the new block.
+ */
+ if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w)))
+ return error;
+ if (w == XFS_DATA_FORK)
+ ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
+ /*
+ * Try mapping it in one filesystem block.
+ */
+ nmap = 1;
+ ASSERT(args->firstblock != NULL);
+ if ((error = xfs_bmapi(tp, dp, bno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
+ XFS_BMAPI_CONTIG,
+ args->firstblock, args->total, &map, &nmap,
+ args->flist))) {
+ return error;
+ }
+ ASSERT(nmap <= 1);
+ if (nmap == 1) {
+ mapp = &map;
+ mapi = 1;
+ }
+ /*
+ * If we didn't get it and the block might work if fragmented,
+ * try without the CONTIG flag. Loop until we get it all.
+ */
+ else if (nmap == 0 && count > 1) {
+ mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+ for (b = bno, mapi = 0; b < bno + count; ) {
+ nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+ c = (int)(bno + count - b);
+ if ((error = xfs_bmapi(tp, dp, b, c,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
+ XFS_BMAPI_METADATA,
+ args->firstblock, args->total,
+ &mapp[mapi], &nmap, args->flist))) {
+ kmem_free(mapp);
+ return error;
+ }
+ if (nmap < 1)
+ break;
+ mapi += nmap;
+ b = mapp[mapi - 1].br_startoff +
+ mapp[mapi - 1].br_blockcount;
+ }
+ } else {
+ mapi = 0;
+ mapp = NULL;
+ }
+ /*
+ * Count the blocks we got, make sure it matches the total.
+ */
+ for (i = 0, got = 0; i < mapi; i++)
+ got += mapp[i].br_blockcount;
+ if (got != count || mapp[0].br_startoff != bno ||
+ mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+ bno + count) {
+ if (mapp != &map)
+ kmem_free(mapp);
+ return XFS_ERROR(ENOSPC);
+ }
+ if (mapp != &map)
+ kmem_free(mapp);
+ /* account for newly allocated blocks in reserved blocks total */
+ args->total -= dp->i_d.di_nblocks - nblks;
+ *new_blkno = (xfs_dablk_t)bno;
+ return 0;
+}
+
+/*
+ * Ick. We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file. The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ */
+STATIC int
+xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
+ xfs_dabuf_t **dead_bufp)
+{
+ xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
+ xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
+ xfs_fileoff_t lastoff;
+ xfs_inode_t *ip;
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ int error, w, entno, level, dead_level;
+ xfs_da_blkinfo_t *dead_info, *sib_info;
+ xfs_da_intnode_t *par_node, *dead_node;
+ xfs_dir2_leaf_t *dead_leaf2;
+ xfs_dahash_t dead_hash;
+
+ dead_buf = *dead_bufp;
+ dead_blkno = *dead_blknop;
+ tp = args->trans;
+ ip = args->dp;
+ w = args->whichfork;
+ ASSERT(w == XFS_DATA_FORK);
+ mp = ip->i_mount;
+ lastoff = mp->m_dirfreeblk;
+ error = xfs_bmap_last_before(tp, ip, &lastoff, w);
+ if (error)
+ return error;
+ if (unlikely(lastoff == 0)) {
+ XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
+ mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ /*
+ * Read the last block in the btree space.
+ */
+ last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
+ if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+ return error;
+ /*
+ * Copy the last block into the dead buffer and log it.
+ */
+ memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize);
+ xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+ dead_info = dead_buf->data;
+ /*
+ * Get values from the moved block.
+ */
+ if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
+ dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+ dead_level = 0;
+ dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
+ } else {
+ ASSERT(be16_to_cpu(dead_info->magic) == XFS_DA_NODE_MAGIC);
+ dead_node = (xfs_da_intnode_t *)dead_info;
+ dead_level = be16_to_cpu(dead_node->hdr.level);
+ dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval);
+ }
+ sib_buf = par_buf = NULL;
+ /*
+ * If the moved block has a left sibling, fix up the pointers.
+ */
+ if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+ if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ goto done;
+ sib_info = sib_buf->data;
+ if (unlikely(
+ be32_to_cpu(sib_info->forw) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
+ XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto done;
+ }
+ sib_info->forw = cpu_to_be32(dead_blkno);
+ xfs_da_log_buf(tp, sib_buf,
+ XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+ sizeof(sib_info->forw)));
+ xfs_da_buf_done(sib_buf);
+ sib_buf = NULL;
+ }
+ /*
+ * If the moved block has a right sibling, fix up the pointers.
+ */
+ if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+ if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ goto done;
+ sib_info = sib_buf->data;
+ if (unlikely(
+ be32_to_cpu(sib_info->back) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
+ XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto done;
+ }
+ sib_info->back = cpu_to_be32(dead_blkno);
+ xfs_da_log_buf(tp, sib_buf,
+ XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+ sizeof(sib_info->back)));
+ xfs_da_buf_done(sib_buf);
+ sib_buf = NULL;
+ }
+ par_blkno = mp->m_dirleafblk;
+ level = -1;
+ /*
+ * Walk down the tree looking for the parent of the moved block.
+ */
+ for (;;) {
+ if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ goto done;
+ par_node = par_buf->data;
+ if (unlikely(
+ be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC ||
+ (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) {
+ XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto done;
+ }
+ level = be16_to_cpu(par_node->hdr.level);
+ for (entno = 0;
+ entno < be16_to_cpu(par_node->hdr.count) &&
+ be32_to_cpu(par_node->btree[entno].hashval) < dead_hash;
+ entno++)
+ continue;
+ if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) {
+ XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto done;
+ }
+ par_blkno = be32_to_cpu(par_node->btree[entno].before);
+ if (level == dead_level + 1)
+ break;
+ xfs_da_brelse(tp, par_buf);
+ par_buf = NULL;
+ }
+ /*
+ * We're in the right parent block.
+ * Look for the right entry.
+ */
+ for (;;) {
+ for (;
+ entno < be16_to_cpu(par_node->hdr.count) &&
+ be32_to_cpu(par_node->btree[entno].before) != last_blkno;
+ entno++)
+ continue;
+ if (entno < be16_to_cpu(par_node->hdr.count))
+ break;
+ par_blkno = be32_to_cpu(par_node->hdr.info.forw);
+ xfs_da_brelse(tp, par_buf);
+ par_buf = NULL;
+ if (unlikely(par_blkno == 0)) {
+ XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto done;
+ }
+ if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ goto done;
+ par_node = par_buf->data;
+ if (unlikely(
+ be16_to_cpu(par_node->hdr.level) != level ||
+ be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC)) {
+ XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto done;
+ }
+ entno = 0;
+ }
+ /*
+ * Update the parent entry pointing to the moved block.
+ */
+ par_node->btree[entno].before = cpu_to_be32(dead_blkno);
+ xfs_da_log_buf(tp, par_buf,
+ XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
+ sizeof(par_node->btree[entno].before)));
+ xfs_da_buf_done(par_buf);
+ xfs_da_buf_done(dead_buf);
+ *dead_blknop = last_blkno;
+ *dead_bufp = last_buf;
+ return 0;
+done:
+ if (par_buf)
+ xfs_da_brelse(tp, par_buf);
+ if (sib_buf)
+ xfs_da_brelse(tp, sib_buf);
+ xfs_da_brelse(tp, last_buf);
+ return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+ xfs_dabuf_t *dead_buf)
+{
+ xfs_inode_t *dp;
+ int done, error, w, count;
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+
+ dp = args->dp;
+ w = args->whichfork;
+ tp = args->trans;
+ mp = dp->i_mount;
+ if (w == XFS_DATA_FORK)
+ count = mp->m_dirblkfsbs;
+ else
+ count = 1;
+ for (;;) {
+ /*
+ * Remove extents. If we get ENOSPC for a dir we have to move
+ * the last block to the place we want to kill.
+ */
+ if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
+ xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+ 0, args->firstblock, args->flist,
+ &done)) == ENOSPC) {
+ if (w != XFS_DATA_FORK)
+ break;
+ if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
+ &dead_buf)))
+ break;
+ } else {
+ break;
+ }
+ }
+ xfs_da_binval(tp, dead_buf);
+ return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+ int nmap,
+ xfs_bmbt_irec_t *mapp,
+ xfs_dablk_t bno,
+ int count)
+{
+ int i;
+ xfs_fileoff_t off;
+
+ for (i = 0, off = bno; i < nmap; i++) {
+ if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+ mapp[i].br_startblock == DELAYSTARTBLOCK) {
+ return 0;
+ }
+ if (off != mapp[i].br_startoff) {
+ return 0;
+ }
+ off += mapp[i].br_blockcount;
+ }
+ return off == bno + count;
+}
+
+/*
+ * Make a dabuf.
+ * Used for get_buf, read_buf, read_bufr, and reada_buf.
+ */
+STATIC int
+xfs_da_do_buf(
+ xfs_trans_t *trans,
+ xfs_inode_t *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t *mappedbnop,
+ xfs_dabuf_t **bpp,
+ int whichfork,
+ int caller,
+ inst_t *ra)
+{
+ xfs_buf_t *bp = NULL;
+ xfs_buf_t **bplist;
+ int error=0;
+ int i;
+ xfs_bmbt_irec_t map;
+ xfs_bmbt_irec_t *mapp;
+ xfs_daddr_t mappedbno;
+ xfs_mount_t *mp;
+ int nbplist=0;
+ int nfsb;
+ int nmap;
+ xfs_dabuf_t *rbp;
+
+ mp = dp->i_mount;
+ nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
+ mappedbno = *mappedbnop;
+ /*
+ * Caller doesn't have a mapping. -2 means don't complain
+ * if we land in a hole.
+ */
+ if (mappedbno == -1 || mappedbno == -2) {
+ /*
+ * Optimize the one-block case.
+ */
+ if (nfsb == 1) {
+ xfs_fsblock_t fsb;
+
+ if ((error =
+ xfs_bmapi_single(trans, dp, whichfork, &fsb,
+ (xfs_fileoff_t)bno))) {
+ return error;
+ }
+ mapp = &map;
+ if (fsb == NULLFSBLOCK) {
+ nmap = 0;
+ } else {
+ map.br_startblock = fsb;
+ map.br_startoff = (xfs_fileoff_t)bno;
+ map.br_blockcount = 1;
+ nmap = 1;
+ }
+ } else {
+ mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
+ nmap = nfsb;
+ if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
+ nfsb,
+ XFS_BMAPI_METADATA |
+ xfs_bmapi_aflag(whichfork),
+ NULL, 0, mapp, &nmap, NULL)))
+ goto exit0;
+ }
+ } else {
+ map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+ map.br_startoff = (xfs_fileoff_t)bno;
+ map.br_blockcount = nfsb;
+ mapp = &map;
+ nmap = 1;
+ }
+ if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
+ error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
+ if (unlikely(error == EFSCORRUPTED)) {
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "%s: bno %lld dir: inode %lld",
+ __func__, (long long)bno,
+ (long long)dp->i_ino);
+ for (i = 0; i < nmap; i++) {
+ xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+ i,
+ (long long)mapp[i].br_startoff,
+ (long long)mapp[i].br_startblock,
+ (long long)mapp[i].br_blockcount,
+ mapp[i].br_state);
+ }
+ }
+ XFS_ERROR_REPORT("xfs_da_do_buf(1)",
+ XFS_ERRLEVEL_LOW, mp);
+ }
+ goto exit0;
+ }
+ if (caller != 3 && nmap > 1) {
+ bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
+ nbplist = 0;
+ } else
+ bplist = NULL;
+ /*
+ * Turn the mapping(s) into buffer(s).
+ */
+ for (i = 0; i < nmap; i++) {
+ int nmapped;
+
+ mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
+ if (i == 0)
+ *mappedbnop = mappedbno;
+ nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
+ switch (caller) {
+ case 0:
+ bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
+ mappedbno, nmapped, 0);
+ error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
+ break;
+ case 1:
+ case 2:
+ bp = NULL;
+ error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
+ mappedbno, nmapped, 0, &bp);
+ break;
+ case 3:
+ xfs_buf_readahead(mp->m_ddev_targp, mappedbno, nmapped);
+ error = 0;
+ bp = NULL;
+ break;
+ }
+ if (error) {
+ if (bp)
+ xfs_trans_brelse(trans, bp);
+ goto exit1;
+ }
+ if (!bp)
+ continue;
+ if (caller == 1) {
+ if (whichfork == XFS_ATTR_FORK) {
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
+ XFS_ATTR_BTREE_REF);
+ } else {
+ XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
+ XFS_DIR_BTREE_REF);
+ }
+ }
+ if (bplist) {
+ bplist[nbplist++] = bp;
+ }
+ }
+ /*
+ * Build a dabuf structure.
+ */
+ if (bplist) {
+ rbp = xfs_da_buf_make(nbplist, bplist, ra);
+ } else if (bp)
+ rbp = xfs_da_buf_make(1, &bp, ra);
+ else
+ rbp = NULL;
+ /*
+ * For read_buf, check the magic number.
+ */
+ if (caller == 1) {
+ xfs_dir2_data_t *data;
+ xfs_dir2_free_t *free;
+ xfs_da_blkinfo_t *info;
+ uint magic, magic1;
+
+ info = rbp->data;
+ data = rbp->data;
+ free = rbp->data;
+ magic = be16_to_cpu(info->magic);
+ magic1 = be32_to_cpu(data->hdr.magic);
+ if (unlikely(
+ XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
+ (magic != XFS_ATTR_LEAF_MAGIC) &&
+ (magic != XFS_DIR2_LEAF1_MAGIC) &&
+ (magic != XFS_DIR2_LEAFN_MAGIC) &&
+ (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
+ (magic1 != XFS_DIR2_DATA_MAGIC) &&
+ (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC),
+ mp, XFS_ERRTAG_DA_READ_BUF,
+ XFS_RANDOM_DA_READ_BUF))) {
+ trace_xfs_da_btree_corrupt(rbp->bps[0], _RET_IP_);
+ XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
+ XFS_ERRLEVEL_LOW, mp, info);
+ error = XFS_ERROR(EFSCORRUPTED);
+ xfs_da_brelse(trans, rbp);
+ nbplist = 0;
+ goto exit1;
+ }
+ }
+ if (bplist) {
+ kmem_free(bplist);
+ }
+ if (mapp != &map) {
+ kmem_free(mapp);
+ }
+ if (bpp)
+ *bpp = rbp;
+ return 0;
+exit1:
+ if (bplist) {
+ for (i = 0; i < nbplist; i++)
+ xfs_trans_brelse(trans, bplist[i]);
+ kmem_free(bplist);
+ }
+exit0:
+ if (mapp != &map)
+ kmem_free(mapp);
+ if (bpp)
+ *bpp = NULL;
+ return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+ xfs_trans_t *trans,
+ xfs_inode_t *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ xfs_dabuf_t **bpp,
+ int whichfork)
+{
+ return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
+ (inst_t *)__return_address);
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+ xfs_trans_t *trans,
+ xfs_inode_t *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ xfs_dabuf_t **bpp,
+ int whichfork)
+{
+ return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
+ (inst_t *)__return_address);
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+ xfs_trans_t *trans,
+ xfs_inode_t *dp,
+ xfs_dablk_t bno,
+ int whichfork)
+{
+ xfs_daddr_t rval;
+
+ rval = -1;
+ if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
+ (inst_t *)__return_address))
+ return -1;
+ else
+ return rval;
+}
+
+kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
+kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+ return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+ int i;
+
+ for (i = 0; i < state->altpath.active; i++) {
+ if (state->altpath.blk[i].bp) {
+ if (state->altpath.blk[i].bp != state->path.blk[i].bp)
+ xfs_da_buf_done(state->altpath.blk[i].bp);
+ state->altpath.blk[i].bp = NULL;
+ }
+ }
+ state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+ int i;
+
+ xfs_da_state_kill_altpath(state);
+ for (i = 0; i < state->path.active; i++) {
+ if (state->path.blk[i].bp)
+ xfs_da_buf_done(state->path.blk[i].bp);
+ }
+ if (state->extravalid && state->extrablk.bp)
+ xfs_da_buf_done(state->extrablk.bp);
+#ifdef DEBUG
+ memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+ kmem_zone_free(xfs_da_state_zone, state);
+}
+
+#ifdef XFS_DABUF_DEBUG
+xfs_dabuf_t *xfs_dabuf_global_list;
+static DEFINE_SPINLOCK(xfs_dabuf_global_lock);
+#endif
+
+/*
+ * Create a dabuf.
+ */
+/* ARGSUSED */
+STATIC xfs_dabuf_t *
+xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
+{
+ xfs_buf_t *bp;
+ xfs_dabuf_t *dabuf;
+ int i;
+ int off;
+
+ if (nbuf == 1)
+ dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
+ else
+ dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
+ dabuf->dirty = 0;
+#ifdef XFS_DABUF_DEBUG
+ dabuf->ra = ra;
+ dabuf->target = XFS_BUF_TARGET(bps[0]);
+ dabuf->blkno = XFS_BUF_ADDR(bps[0]);
+#endif
+ if (nbuf == 1) {
+ dabuf->nbuf = 1;
+ bp = bps[0];
+ dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
+ dabuf->data = XFS_BUF_PTR(bp);
+ dabuf->bps[0] = bp;
+ } else {
+ dabuf->nbuf = nbuf;
+ for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
+ dabuf->bps[i] = bp = bps[i];
+ dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
+ }
+ dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
+ for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+ bp = bps[i];
+ memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp),
+ XFS_BUF_COUNT(bp));
+ }
+ }
+#ifdef XFS_DABUF_DEBUG
+ {
+ xfs_dabuf_t *p;
+
+ spin_lock(&xfs_dabuf_global_lock);
+ for (p = xfs_dabuf_global_list; p; p = p->next) {
+ ASSERT(p->blkno != dabuf->blkno ||
+ p->target != dabuf->target);
+ }
+ dabuf->prev = NULL;
+ if (xfs_dabuf_global_list)
+ xfs_dabuf_global_list->prev = dabuf;
+ dabuf->next = xfs_dabuf_global_list;
+ xfs_dabuf_global_list = dabuf;
+ spin_unlock(&xfs_dabuf_global_lock);
+ }
+#endif
+ return dabuf;
+}
+
+/*
+ * Un-dirty a dabuf.
+ */
+STATIC void
+xfs_da_buf_clean(xfs_dabuf_t *dabuf)
+{
+ xfs_buf_t *bp;
+ int i;
+ int off;
+
+ if (dabuf->dirty) {
+ ASSERT(dabuf->nbuf > 1);
+ dabuf->dirty = 0;
+ for (i = off = 0; i < dabuf->nbuf;
+ i++, off += XFS_BUF_COUNT(bp)) {
+ bp = dabuf->bps[i];
+ memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
+ XFS_BUF_COUNT(bp));
+ }
+ }
+}
+
+/*
+ * Release a dabuf.
+ */
+void
+xfs_da_buf_done(xfs_dabuf_t *dabuf)
+{
+ ASSERT(dabuf);
+ ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+ if (dabuf->dirty)
+ xfs_da_buf_clean(dabuf);
+ if (dabuf->nbuf > 1)
+ kmem_free(dabuf->data);
+#ifdef XFS_DABUF_DEBUG
+ {
+ spin_lock(&xfs_dabuf_global_lock);
+ if (dabuf->prev)
+ dabuf->prev->next = dabuf->next;
+ else
+ xfs_dabuf_global_list = dabuf->next;
+ if (dabuf->next)
+ dabuf->next->prev = dabuf->prev;
+ spin_unlock(&xfs_dabuf_global_lock);
+ }
+ memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf));
+#endif
+ if (dabuf->nbuf == 1)
+ kmem_zone_free(xfs_dabuf_zone, dabuf);
+ else
+ kmem_free(dabuf);
+}
+
+/*
+ * Log transaction from a dabuf.
+ */
+void
+xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
+{
+ xfs_buf_t *bp;
+ uint f;
+ int i;
+ uint l;
+ int off;
+
+ ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+ if (dabuf->nbuf == 1) {
+ ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
+ xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
+ return;
+ }
+ dabuf->dirty = 1;
+ ASSERT(first <= last);
+ for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+ bp = dabuf->bps[i];
+ f = off;
+ l = f + XFS_BUF_COUNT(bp) - 1;
+ if (f < first)
+ f = first;
+ if (l > last)
+ l = last;
+ if (f <= l)
+ xfs_trans_log_buf(tp, bp, f - off, l - off);
+ /*
+ * B_DONE is set by xfs_trans_log buf.
+ * If we don't set it on a new buffer (get not read)
+ * then if we don't put anything in the buffer it won't
+ * be set, and at commit it it released into the cache,
+ * and then a read will fail.
+ */
+ else if (!(XFS_BUF_ISDONE(bp)))
+ XFS_BUF_DONE(bp);
+ }
+ ASSERT(last < off);
+}
+
+/*
+ * Release dabuf from a transaction.
+ * Have to free up the dabuf before the buffers are released,
+ * since the synchronization on the dabuf is really the lock on the buffer.
+ */
+void
+xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+ xfs_buf_t *bp;
+ xfs_buf_t **bplist;
+ int i;
+ int nbuf;
+
+ ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+ if ((nbuf = dabuf->nbuf) == 1) {
+ bplist = &bp;
+ bp = dabuf->bps[0];
+ } else {
+ bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
+ memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
+ }
+ xfs_da_buf_done(dabuf);
+ for (i = 0; i < nbuf; i++)
+ xfs_trans_brelse(tp, bplist[i]);
+ if (bplist != &bp)
+ kmem_free(bplist);
+}
+
+/*
+ * Invalidate dabuf from a transaction.
+ */
+void
+xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+ xfs_buf_t *bp;
+ xfs_buf_t **bplist;
+ int i;
+ int nbuf;
+
+ ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+ if ((nbuf = dabuf->nbuf) == 1) {
+ bplist = &bp;
+ bp = dabuf->bps[0];
+ } else {
+ bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
+ memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
+ }
+ xfs_da_buf_done(dabuf);
+ for (i = 0; i < nbuf; i++)
+ xfs_trans_binval(tp, bplist[i]);
+ if (bplist != &bp)
+ kmem_free(bplist);
+}
+
+/*
+ * Get the first daddr from a dabuf.
+ */
+xfs_daddr_t
+xfs_da_blkno(xfs_dabuf_t *dabuf)
+{
+ ASSERT(dabuf->nbuf);
+ ASSERT(dabuf->data);
+ return XFS_BUF_ADDR(dabuf->bps[0]);
+}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
new file mode 100644
index 00000000..fe9f5a8c
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define __XFS_DA_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct zone;
+
+/*========================================================================
+ * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * Is is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+ __be32 forw; /* previous block in list */
+ __be32 back; /* following block in list */
+ __be16 magic; /* validity check on block */
+ __be16 pad; /* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+
+typedef struct xfs_da_intnode {
+ struct xfs_da_node_hdr { /* constant-structure header block */
+ xfs_da_blkinfo_t info; /* block type, links, etc. */
+ __be16 count; /* count of active entries */
+ __be16 level; /* level above leaves (leaf == 0) */
+ } hdr;
+ struct xfs_da_node_entry {
+ __be32 hashval; /* hash value for this descendant */
+ __be32 before; /* Btree block before this key */
+ } btree[1]; /* variable sized array of keys */
+} xfs_da_intnode_t;
+typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
+typedef struct xfs_da_node_entry xfs_da_node_entry_t;
+
+#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+ XFS_CMP_DIFFERENT, /* names are completely different */
+ XFS_CMP_EXACT, /* names are exactly the same */
+ XFS_CMP_CASE /* names are same but differ in case */
+};
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+ const __uint8_t *name; /* string (maybe not NULL terminated) */
+ int namelen; /* length of string (maybe no NULL) */
+ __uint8_t *value; /* set of bytes (maybe contain NULLs) */
+ int valuelen; /* length of value */
+ int flags; /* argument flags (eg: ATTR_NOCREATE) */
+ xfs_dahash_t hashval; /* hash value of name */
+ xfs_ino_t inumber; /* input/output inode number */
+ struct xfs_inode *dp; /* directory inode to manipulate */
+ xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */
+ struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */
+ struct xfs_trans *trans; /* current trans (changes over time) */
+ xfs_extlen_t total; /* total blocks needed, for 1st bmap */
+ int whichfork; /* data or attribute fork */
+ xfs_dablk_t blkno; /* blkno of attr leaf of interest */
+ int index; /* index of attr of interest in blk */
+ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
+ int rmtblkcnt; /* remote attr value block count */
+ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
+ int index2; /* index of 2nd attr in blk */
+ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
+ int rmtblkcnt2; /* remote attr value block count */
+ int op_flags; /* operation flags */
+ enum xfs_dacmp cmpresult; /* name compare result for lookups */
+} xfs_da_args_t;
+
+/*
+ * Operation flags:
+ */
+#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
+#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
+#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
+
+#define XFS_DA_OP_FLAGS \
+ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
+ { XFS_DA_OP_RENAME, "RENAME" }, \
+ { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
+ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
+ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }
+
+/*
+ * Structure to describe buffer(s) for a block.
+ * This is needed in the directory version 2 format case, when
+ * multiple non-contiguous fsblocks might be needed to cover one
+ * logical directory block.
+ * If the buffer count is 1 then the data pointer points to the
+ * same place as the b_addr field for the buffer, else to kmem_alloced memory.
+ */
+typedef struct xfs_dabuf {
+ int nbuf; /* number of buffer pointers present */
+ short dirty; /* data needs to be copied back */
+ short bbcount; /* how large is data in bbs */
+ void *data; /* pointer for buffers' data */
+#ifdef XFS_DABUF_DEBUG
+ inst_t *ra; /* return address of caller to make */
+ struct xfs_dabuf *next; /* next in global chain */
+ struct xfs_dabuf *prev; /* previous in global chain */
+ struct xfs_buftarg *target; /* device for buffer */
+ xfs_daddr_t blkno; /* daddr first in bps[0] */
+#endif
+ struct xfs_buf *bps[1]; /* actually nbuf of these */
+} xfs_dabuf_t;
+#define XFS_DA_BUF_SIZE(n) \
+ (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
+
+#ifdef XFS_DABUF_DEBUG
+extern xfs_dabuf_t *xfs_dabuf_global_list;
+#endif
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes. With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+ xfs_dabuf_t *bp; /* buffer containing block */
+ xfs_dablk_t blkno; /* filesystem blkno of buffer */
+ xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
+ int index; /* relevant index into block */
+ xfs_dahash_t hashval; /* last hash value in block */
+ int magic; /* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+ int active; /* number of active levels */
+ xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+ xfs_da_args_t *args; /* filename arguments */
+ struct xfs_mount *mp; /* filesystem mount point */
+ unsigned int blocksize; /* logical block size */
+ unsigned int node_ents; /* how many entries in danode */
+ xfs_da_state_path_t path; /* search/split paths */
+ xfs_da_state_path_t altpath; /* alternate path for join */
+ unsigned char inleaf; /* insert into 1->lf, 0->splf */
+ unsigned char extravalid; /* T/F: extrablk is in use */
+ unsigned char extraafter; /* T/F: extrablk is after new */
+ xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
+ /* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
+ (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+
+/*
+ * Name ops for directory and/or attr name operations
+ */
+struct xfs_nameops {
+ xfs_dahash_t (*hashname)(struct xfs_name *);
+ enum xfs_dacmp (*compname)(struct xfs_da_args *,
+ const unsigned char *, int);
+};
+
+
+/*========================================================================
+ * Function prototypes.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+ xfs_dabuf_t **bpp, int whichfork);
+int xfs_da_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int xfs_da_join(xfs_da_state_t *state);
+void xfs_da_fixhashpath(xfs_da_state_t *state,
+ xfs_da_state_path_t *path_to_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
+int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+ int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+ xfs_da_state_blk_t *new_blk);
+
+/*
+ * Utility routines.
+ */
+int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ xfs_dabuf_t **bp, int whichfork);
+int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ xfs_dabuf_t **bpp, int whichfork);
+xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+ xfs_dablk_t bno, int whichfork);
+int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+ xfs_dabuf_t *dead_buf);
+
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+
+void xfs_da_buf_done(xfs_dabuf_t *dabuf);
+void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
+ uint last);
+void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
+void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
+xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
+
+extern struct kmem_zone *xfs_da_state_zone;
+extern struct kmem_zone *xfs_dabuf_zone;
+extern const struct xfs_nameops xfs_default_nameops;
+
+#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
new file mode 100644
index 00000000..9a84a85c
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.c
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_itable.h"
+#include "xfs_dfrag.h"
+#include "xfs_error.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+
+static int xfs_swap_extents(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip, /* tmp inode */
+ xfs_swapext_t *sxp);
+
+/*
+ * ioctl interface for swapext
+ */
+int
+xfs_swapext(
+ xfs_swapext_t *sxp)
+{
+ xfs_inode_t *ip, *tip;
+ struct file *file, *tmp_file;
+ int error = 0;
+
+ /* Pull information for the target fd */
+ file = fget((int)sxp->sx_fdtarget);
+ if (!file) {
+ error = XFS_ERROR(EINVAL);
+ goto out;
+ }
+
+ if (!(file->f_mode & FMODE_WRITE) ||
+ !(file->f_mode & FMODE_READ) ||
+ (file->f_flags & O_APPEND)) {
+ error = XFS_ERROR(EBADF);
+ goto out_put_file;
+ }
+
+ tmp_file = fget((int)sxp->sx_fdtmp);
+ if (!tmp_file) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_file;
+ }
+
+ if (!(tmp_file->f_mode & FMODE_WRITE) ||
+ !(tmp_file->f_mode & FMODE_READ) ||
+ (tmp_file->f_flags & O_APPEND)) {
+ error = XFS_ERROR(EBADF);
+ goto out_put_tmp_file;
+ }
+
+ if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
+ IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_tmp_file;
+ }
+
+ ip = XFS_I(file->f_path.dentry->d_inode);
+ tip = XFS_I(tmp_file->f_path.dentry->d_inode);
+
+ if (ip->i_mount != tip->i_mount) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_tmp_file;
+ }
+
+ if (ip->i_ino == tip->i_ino) {
+ error = XFS_ERROR(EINVAL);
+ goto out_put_tmp_file;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ error = XFS_ERROR(EIO);
+ goto out_put_tmp_file;
+ }
+
+ error = xfs_swap_extents(ip, tip, sxp);
+
+ out_put_tmp_file:
+ fput(tmp_file);
+ out_put_file:
+ fput(file);
+ out:
+ return error;
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6. If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip) /* tmp inode */
+{
+
+ /* Should never get a local format */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+ tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ return EINVAL;
+
+ /*
+ * if the target inode has less extents that then temporary inode then
+ * why did userspace call us?
+ */
+ if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ return EINVAL;
+
+ /*
+ * if the target inode is in extent form and the temp inode is in btree
+ * form then we will end up with the target inode in the wrong format
+ * as we already know there are less extents in the temp inode.
+ */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ return EINVAL;
+
+ /* Check temp in extent form to max in target */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ return EINVAL;
+
+ /* Check target in extent form to max in temp */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ return EINVAL;
+
+ /*
+ * If we are in a btree format, check that the temp root block will fit
+ * in the target and that it has enough extents to be in btree format
+ * in the target.
+ *
+ * Note that we have to be careful to allow btree->extent conversions
+ * (a common defrag case) which will occur when the temp inode is in
+ * extent format...
+ */
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ ((XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
+ return EINVAL;
+
+ /* Reciprocal target->temp btree format checks */
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ ((XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
+ return EINVAL;
+
+ return 0;
+}
+
+static int
+xfs_swap_extents(
+ xfs_inode_t *ip, /* target inode */
+ xfs_inode_t *tip, /* tmp inode */
+ xfs_swapext_t *sxp)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_trans_t *tp;
+ xfs_bstat_t *sbp = &sxp->sx_stat;
+ xfs_ifork_t *tempifp, *ifp, *tifp;
+ int ilf_fields, tilf_fields;
+ int error = 0;
+ int aforkblks = 0;
+ int taforkblks = 0;
+ __uint64_t tmp;
+
+ tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+ if (!tempifp) {
+ error = XFS_ERROR(ENOMEM);
+ goto out;
+ }
+
+ /*
+ * we have to do two separate lock calls here to keep lockdep
+ * happy. If we try to get all the locks in one call, lock will
+ * report false positives when we drop the ILOCK and regain them
+ * below.
+ */
+ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+ /* Verify that both files have the same format */
+ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ if (VN_CACHED(VFS_I(tip)) != 0) {
+ error = xfs_flushinval_pages(tip, 0, -1,
+ FI_REMAPF_LOCKED);
+ if (error)
+ goto out_unlock;
+ }
+
+ /* Verify O_DIRECT for ftmp */
+ if (VN_CACHED(VFS_I(tip)) != 0) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ /* Verify all data are being swapped */
+ if (sxp->sx_offset != 0 ||
+ sxp->sx_length != ip->i_d.di_size ||
+ sxp->sx_length != tip->i_d.di_size) {
+ error = XFS_ERROR(EFAULT);
+ goto out_unlock;
+ }
+
+ trace_xfs_swap_extent_before(ip, 0);
+ trace_xfs_swap_extent_before(tip, 1);
+
+ /* check inode formats now that data is flushed */
+ error = xfs_swap_extents_check_format(ip, tip);
+ if (error) {
+ xfs_notice(mp,
+ "%s: inode 0x%llx format is incompatible for exchanging.",
+ __func__, ip->i_ino);
+ goto out_unlock;
+ }
+
+ /*
+ * Compare the current change & modify times with that
+ * passed in. If they differ, we abort this swap.
+ * This is the mechanism used to ensure the calling
+ * process that the file was not changed out from
+ * under it.
+ */
+ if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+ (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+ error = XFS_ERROR(EBUSY);
+ goto out_unlock;
+ }
+
+ /* We need to fail if the file is memory mapped. Once we have tossed
+ * all existing pages, the page fault will have no option
+ * but to go to the filesystem for pages. By making the page fault call
+ * vop_read (or write in the case of autogrow) they block on the iolock
+ * until we have switched the extents.
+ */
+ if (VN_MAPPED(VFS_I(ip))) {
+ error = XFS_ERROR(EBUSY);
+ goto out_unlock;
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL);
+
+ /*
+ * There is a race condition here since we gave up the
+ * ilock. However, the data fork will not change since
+ * we have the iolock (locked for truncation too) so we
+ * are safe. We don't really care if non-io related
+ * fields change.
+ */
+
+ xfs_tosspages(ip, 0, -1, FI_REMAPF);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+ if ((error = xfs_trans_reserve(tp, 0,
+ XFS_ICHANGE_LOG_RES(mp), 0,
+ 0, 0))) {
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ xfs_iunlock(tip, XFS_IOLOCK_EXCL);
+ xfs_trans_cancel(tp, 0);
+ goto out;
+ }
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+ /*
+ * Count the number of extended attribute blocks
+ */
+ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+ (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+ (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+ &taforkblks);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
+ * Swap the data forks of the inodes
+ */
+ ifp = &ip->i_df;
+ tifp = &tip->i_df;
+ *tempifp = *ifp; /* struct copy */
+ *ifp = *tifp; /* struct copy */
+ *tifp = *tempifp; /* struct copy */
+
+ /*
+ * Fix the in-memory data fork values that are dependent on the fork
+ * offset in the inode. We can't assume they remain the same as attr2
+ * has dynamic fork offsets.
+ */
+ ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+ tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
+ (uint)sizeof(xfs_bmbt_rec_t);
+
+ /*
+ * Fix the on-disk inode values
+ */
+ tmp = (__uint64_t)ip->i_d.di_nblocks;
+ ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+ tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+ tmp = (__uint64_t) ip->i_d.di_nextents;
+ ip->i_d.di_nextents = tip->i_d.di_nextents;
+ tip->i_d.di_nextents = tmp;
+
+ tmp = (__uint64_t) ip->i_d.di_format;
+ ip->i_d.di_format = tip->i_d.di_format;
+ tip->i_d.di_format = tmp;
+
+ /*
+ * The extents in the source inode could still contain speculative
+ * preallocation beyond EOF (e.g. the file is open but not modified
+ * while defrag is in progress). In that case, we need to copy over the
+ * number of delalloc blocks the data fork in the source inode is
+ * tracking beyond EOF so that when the fork is truncated away when the
+ * temporary inode is unlinked we don't underrun the i_delayed_blks
+ * counter on that inode.
+ */
+ ASSERT(tip->i_delayed_blks == 0);
+ tip->i_delayed_blks = ip->i_delayed_blks;
+ ip->i_delayed_blks = 0;
+
+ ilf_fields = XFS_ILOG_CORE;
+
+ switch(ip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /* If the extents fit in the inode, fix the
+ * pointer. Otherwise it's already NULL or
+ * pointing to the extent.
+ */
+ if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+ ifp->if_u1.if_extents =
+ ifp->if_u2.if_inline_ext;
+ }
+ ilf_fields |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ ilf_fields |= XFS_ILOG_DBROOT;
+ break;
+ }
+
+ tilf_fields = XFS_ILOG_CORE;
+
+ switch(tip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ /* If the extents fit in the inode, fix the
+ * pointer. Otherwise it's already NULL or
+ * pointing to the extent.
+ */
+ if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+ tifp->if_u1.if_extents =
+ tifp->if_u2.if_inline_ext;
+ }
+ tilf_fields |= XFS_ILOG_DEXT;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ tilf_fields |= XFS_ILOG_DBROOT;
+ break;
+ }
+
+
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ xfs_trans_log_inode(tp, ip, ilf_fields);
+ xfs_trans_log_inode(tp, tip, tilf_fields);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
+
+ trace_xfs_swap_extent_after(ip, 0);
+ trace_xfs_swap_extent_after(tip, 1);
+out:
+ kmem_free(tempifp);
+ return error;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ goto out;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
new file mode 100644
index 00000000..20bdd935
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DFRAG_H__
+#define __XFS_DFRAG_H__
+
+/*
+ * Structure passed to xfs_swapext
+ */
+
+typedef struct xfs_swapext
+{
+ __int64_t sx_version; /* version */
+ __int64_t sx_fdtarget; /* fd of target file */
+ __int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* leng from offset */
+ char sx_pad[16]; /* pad space, unused */
+ xfs_bstat_t sx_stat; /* stat of target b4 copy */
+} xfs_swapext_t;
+
+/*
+ * Version flag
+ */
+#define XFS_SX_VERSION 0
+
+#ifdef __KERNEL__
+/*
+ * Prototypes for visible xfs_dfrag.c routines.
+ */
+
+/*
+ * Syscall interface for xfs_swapext
+ */
+int xfs_swapext(struct xfs_swapext *sx);
+
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
new file mode 100644
index 00000000..dffba9ba
--- /dev/null
+++ b/fs/xfs/xfs_dinode.h
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DINODE_H__
+#define __XFS_DINODE_H__
+
+#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
+
+typedef struct xfs_timestamp {
+ __be32 t_sec; /* timestamp seconds */
+ __be32 t_nsec; /* timestamp nanoseconds */
+} xfs_timestamp_t;
+
+/*
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core", the inode is expanded to fill a
+ * variable size the leftover area split into a data and an attribute fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat. To access the data and
+ * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct icdinode in xfs_inode which matches the
+ * layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
+ */
+typedef struct xfs_dinode {
+ __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ __be16 di_mode; /* mode and type of file */
+ __u8 di_version; /* inode version */
+ __u8 di_format; /* format of di_c data */
+ __be16 di_onlink; /* old number of links to file */
+ __be32 di_uid; /* owner's user id */
+ __be32 di_gid; /* owner's group id */
+ __be32 di_nlink; /* number of links to file */
+ __be16 di_projid_lo; /* lower part of owner's project id */
+ __be16 di_projid_hi; /* higher part owner's project id */
+ __u8 di_pad[6]; /* unused, zeroed space */
+ __be16 di_flushiter; /* incremented on flush */
+ xfs_timestamp_t di_atime; /* time last accessed */
+ xfs_timestamp_t di_mtime; /* time last modified */
+ xfs_timestamp_t di_ctime; /* time created/inode modified */
+ __be64 di_size; /* number of bytes in file */
+ __be64 di_nblocks; /* # of direct & btree blocks used */
+ __be32 di_extsize; /* basic/minimum extent size for file */
+ __be32 di_nextents; /* number of extents in data fork */
+ __be16 di_anextents; /* number of extents in attribute fork*/
+ __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __s8 di_aformat; /* format of attr fork's data */
+ __be32 di_dmevmask; /* DMIG event mask */
+ __be16 di_dmstate; /* DMIG state info */
+ __be16 di_flags; /* random flags, XFS_DIFLAG_... */
+ __be32 di_gen; /* generation number */
+
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ __be32 di_next_unlinked;/* agi unlinked list ptr */
+} __attribute__((packed)) xfs_dinode_t;
+
+#define DI_MAX_FLUSH 0xffff
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define XFS_MAXLINK ((1U << 31) - 1U)
+#define XFS_MAXLINK_1 65535U
+
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt {
+ XFS_DINODE_FMT_DEV, /* xfs_dev_t */
+ XFS_DINODE_FMT_LOCAL, /* bulk data */
+ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
+ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
+ XFS_DINODE_FMT_UUID /* uuid_t */
+} xfs_dinode_fmt_t;
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG 8
+#define XFS_DINODE_MAX_LOG 11
+#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
+#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#define XFS_LITINO(mp) \
+ ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
+
+#define XFS_BROOT_SIZE_ADJ \
+ (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
+
+#define XFS_DFORK_DSIZE(dip,mp) \
+ (XFS_DFORK_Q(dip) ? \
+ XFS_DFORK_BOFF(dip) : \
+ XFS_LITINO(mp))
+#define XFS_DFORK_ASIZE(dip,mp) \
+ (XFS_DFORK_Q(dip) ? \
+ XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \
+ 0)
+#define XFS_DFORK_SIZE(dip,mp,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_DFORK_DSIZE(dip, mp) : \
+ XFS_DFORK_ASIZE(dip, mp))
+
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+ ((char *)(dip) + sizeof(struct xfs_dinode))
+#define XFS_DFORK_APTR(dip) \
+ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
+#define XFS_DFORK_PTR(dip,w) \
+ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
+#define XFS_DFORK_FORMAT(dip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (dip)->di_format : \
+ (dip)->di_aformat)
+#define XFS_DFORK_NEXTENTS(dip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ be32_to_cpu((dip)->di_nextents) : \
+ be16_to_cpu((dip)->di_anextents))
+
+#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
+
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+ return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+ *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
+/*
+ * Values for di_flags
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_XFLAG_s.
+ */
+#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
+#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
+#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
+#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
+#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
+#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
+#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
+#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
+#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
+#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */
+#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
+#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
+#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
+#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
+#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
+#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
+#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
+#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
+#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
+#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
+#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT)
+#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT)
+#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT)
+
+#ifdef CONFIG_XFS_RT
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+#else
+#define XFS_IS_REALTIME_INODE(ip) (0)
+#endif
+
+#define XFS_DIFLAG_ANY \
+ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
+ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
+ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
+ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
+ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
+
+#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
new file mode 100644
index 00000000..dba7a71c
--- /dev/null
+++ b/fs/xfs/xfs_dir2.c
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_dir2_node.h"
+#include "xfs_error.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
+
+/*
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
+ */
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+ struct xfs_name *name)
+{
+ xfs_dahash_t hash;
+ int i;
+
+ for (i = 0, hash = 0; i < name->len; i++)
+ hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+ return hash;
+}
+
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ enum xfs_dacmp result;
+ int i;
+
+ if (args->namelen != len)
+ return XFS_CMP_DIFFERENT;
+
+ result = XFS_CMP_EXACT;
+ for (i = 0; i < len; i++) {
+ if (args->name[i] == name[i])
+ continue;
+ if (tolower(args->name[i]) != tolower(name[i]))
+ return XFS_CMP_DIFFERENT;
+ result = XFS_CMP_CASE;
+ }
+
+ return result;
+}
+
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+ .hashname = xfs_ascii_ci_hashname,
+ .compname = xfs_ascii_ci_compname,
+};
+
+void
+xfs_dir_mount(
+ xfs_mount_t *mp)
+{
+ ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
+ ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
+ XFS_MAX_BLOCKSIZE);
+ mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
+ mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
+ mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
+ mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
+ mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
+ mp->m_attr_node_ents =
+ (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ mp->m_dir_node_ents =
+ (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
+ (uint)sizeof(xfs_da_node_entry_t);
+ mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
+ if (xfs_sb_version_hasasciici(&mp->m_sb))
+ mp->m_dirnameops = &xfs_ascii_ci_nameops;
+ else
+ mp->m_dirnameops = &xfs_default_nameops;
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+int
+xfs_dir_isempty(
+ xfs_inode_t *dp)
+{
+ xfs_dir2_sf_t *sfp;
+
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ if (dp->i_d.di_size == 0) /* might happen during shutdown. */
+ return 1;
+ if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+ return 0;
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ return !sfp->hdr.count;
+}
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+ xfs_mount_t *mp,
+ xfs_ino_t ino)
+{
+ xfs_agblock_t agblkno;
+ xfs_agino_t agino;
+ xfs_agnumber_t agno;
+ int ino_ok;
+ int ioff;
+
+ agno = XFS_INO_TO_AGNO(mp, ino);
+ agblkno = XFS_INO_TO_AGBNO(mp, ino);
+ ioff = XFS_INO_TO_OFFSET(mp, ino);
+ agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+ ino_ok =
+ agno < mp->m_sb.sb_agcount &&
+ agblkno < mp->m_sb.sb_agblocks &&
+ agblkno != 0 &&
+ ioff < (1 << mp->m_sb.sb_inopblog) &&
+ XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+ if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+ XFS_RANDOM_DIR_INO_VALIDATE))) {
+ xfs_warn(mp, "Invalid inode number 0x%Lx",
+ (unsigned long long) ino);
+ XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+int
+xfs_dir_init(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ xfs_inode_t *pdp)
+{
+ xfs_da_args_t args;
+ int error;
+
+ memset((char *)&args, 0, sizeof(args));
+ args.dp = dp;
+ args.trans = tp;
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
+ return error;
+ return xfs_dir2_sf_create(&args, pdp->i_ino);
+}
+
+/*
+ Enter a name in a directory.
+ */
+int
+xfs_dir_createname(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_ino_t inum, /* new entry inode number */
+ xfs_fsblock_t *first, /* bmap's firstblock */
+ xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ xfs_da_args_t args;
+ int rval;
+ int v; /* type-checking value */
+
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
+ return rval;
+ XFS_STATS_INC(xs_dir_create);
+
+ memset(&args, 0, sizeof(xfs_da_args_t));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args.inumber = inum;
+ args.dp = dp;
+ args.firstblock = first;
+ args.flist = flist;
+ args.total = total;
+ args.whichfork = XFS_DATA_FORK;
+ args.trans = tp;
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_addname(&args);
+ else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_block_addname(&args);
+ else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_leaf_addname(&args);
+ else
+ rval = xfs_dir2_node_addname(&args);
+ return rval;
+}
+
+/*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (args->cmpresult == XFS_CMP_DIFFERENT)
+ return ENOENT;
+ if (args->cmpresult != XFS_CMP_CASE ||
+ !(args->op_flags & XFS_DA_OP_CILOOKUP))
+ return EEXIST;
+
+ args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+ if (!args->value)
+ return ENOMEM;
+
+ memcpy(args->value, name, len);
+ args->valuelen = len;
+ return EEXIST;
+}
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
+ */
+
+int
+xfs_dir_lookup(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
+{
+ xfs_da_args_t args;
+ int rval;
+ int v; /* type-checking value */
+
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ XFS_STATS_INC(xs_dir_lookup);
+
+ memset(&args, 0, sizeof(xfs_da_args_t));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args.dp = dp;
+ args.whichfork = XFS_DATA_FORK;
+ args.trans = tp;
+ args.op_flags = XFS_DA_OP_OKNOENT;
+ if (ci_name)
+ args.op_flags |= XFS_DA_OP_CILOOKUP;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_lookup(&args);
+ else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_block_lookup(&args);
+ else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_leaf_lookup(&args);
+ else
+ rval = xfs_dir2_node_lookup(&args);
+ if (rval == EEXIST)
+ rval = 0;
+ if (!rval) {
+ *inum = args.inumber;
+ if (ci_name) {
+ ci_name->name = args.value;
+ ci_name->len = args.valuelen;
+ }
+ }
+ return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+int
+xfs_dir_removename(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_ino_t ino,
+ xfs_fsblock_t *first, /* bmap's firstblock */
+ xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ xfs_da_args_t args;
+ int rval;
+ int v; /* type-checking value */
+
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ XFS_STATS_INC(xs_dir_remove);
+
+ memset(&args, 0, sizeof(xfs_da_args_t));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args.inumber = ino;
+ args.dp = dp;
+ args.firstblock = first;
+ args.flist = flist;
+ args.total = total;
+ args.whichfork = XFS_DATA_FORK;
+ args.trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_removename(&args);
+ else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_block_removename(&args);
+ else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_leaf_removename(&args);
+ else
+ rval = xfs_dir2_node_removename(&args);
+ return rval;
+}
+
+/*
+ * Read a directory.
+ */
+int
+xfs_readdir(
+ xfs_inode_t *dp,
+ void *dirent,
+ size_t bufsize,
+ xfs_off_t *offset,
+ filldir_t filldir)
+{
+ int rval; /* return value */
+ int v; /* type-checking value */
+
+ trace_xfs_readdir(dp);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return XFS_ERROR(EIO);
+
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+ XFS_STATS_INC(xs_dir_getdents);
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir);
+ else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
+ ;
+ else if (v)
+ rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir);
+ else
+ rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset,
+ filldir);
+ return rval;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+int
+xfs_dir_replace(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name, /* name of entry to replace */
+ xfs_ino_t inum, /* new inode number */
+ xfs_fsblock_t *first, /* bmap's firstblock */
+ xfs_bmap_free_t *flist, /* bmap's freeblock list */
+ xfs_extlen_t total) /* bmap's total block count */
+{
+ xfs_da_args_t args;
+ int rval;
+ int v; /* type-checking value */
+
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+
+ if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
+ return rval;
+
+ memset(&args, 0, sizeof(xfs_da_args_t));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args.inumber = inum;
+ args.dp = dp;
+ args.firstblock = first;
+ args.flist = flist;
+ args.total = total;
+ args.whichfork = XFS_DATA_FORK;
+ args.trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_replace(&args);
+ else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_block_replace(&args);
+ else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_leaf_replace(&args);
+ else
+ rval = xfs_dir2_node_replace(&args);
+ return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ * First checks that the caller couldn't reserve enough space (resblks = 0).
+ */
+int
+xfs_dir_canenter(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ struct xfs_name *name, /* name of entry to add */
+ uint resblks)
+{
+ xfs_da_args_t args;
+ int rval;
+ int v; /* type-checking value */
+
+ if (resblks)
+ return 0;
+
+ ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
+
+ memset(&args, 0, sizeof(xfs_da_args_t));
+ args.name = name->name;
+ args.namelen = name->len;
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args.dp = dp;
+ args.whichfork = XFS_DATA_FORK;
+ args.trans = tp;
+ args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+ XFS_DA_OP_OKNOENT;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ rval = xfs_dir2_sf_addname(&args);
+ else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_block_addname(&args);
+ else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
+ return rval;
+ else if (v)
+ rval = xfs_dir2_leaf_addname(&args);
+ else
+ rval = xfs_dir2_node_addname(&args);
+ return rval;
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ * This routine is for data and free blocks, not leaf/node blocks
+ * which are handled by xfs_da_grow_inode.
+ */
+int
+xfs_dir2_grow_inode(
+ xfs_da_args_t *args,
+ int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
+ xfs_dir2_db_t *dbp) /* out: block number added */
+{
+ xfs_fileoff_t bno; /* directory offset of new block */
+ int count; /* count of filesystem blocks */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error;
+ int got; /* blocks actually mapped */
+ int i;
+ xfs_bmbt_irec_t map; /* single structure for bmap */
+ int mapi; /* mapping index */
+ xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */
+ xfs_mount_t *mp;
+ int nmap; /* number of bmap entries */
+ xfs_trans_t *tp;
+ xfs_drfsbno_t nblks;
+
+ trace_xfs_dir2_grow_inode(args, space);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ nblks = dp->i_d.di_nblocks;
+ /*
+ * Set lowest possible block in the space requested.
+ */
+ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+ count = mp->m_dirblkfsbs;
+ /*
+ * Find the first hole for our block.
+ */
+ if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK)))
+ return error;
+ nmap = 1;
+ ASSERT(args->firstblock != NULL);
+ /*
+ * Try mapping the new block contiguously (one extent).
+ */
+ if ((error = xfs_bmapi(tp, dp, bno, count,
+ XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+ args->firstblock, args->total, &map, &nmap,
+ args->flist)))
+ return error;
+ ASSERT(nmap <= 1);
+ if (nmap == 1) {
+ mapp = &map;
+ mapi = 1;
+ }
+ /*
+ * Didn't work and this is a multiple-fsb directory block.
+ * Try again with contiguous flag turned on.
+ */
+ else if (nmap == 0 && count > 1) {
+ xfs_fileoff_t b; /* current file offset */
+
+ /*
+ * Space for maximum number of mappings.
+ */
+ mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+ /*
+ * Iterate until we get to the end of our block.
+ */
+ for (b = bno, mapi = 0; b < bno + count; ) {
+ int c; /* current fsb count */
+
+ /*
+ * Can't map more than MAX_NMAP at once.
+ */
+ nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+ c = (int)(bno + count - b);
+ if ((error = xfs_bmapi(tp, dp, b, c,
+ XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
+ args->firstblock, args->total,
+ &mapp[mapi], &nmap, args->flist))) {
+ kmem_free(mapp);
+ return error;
+ }
+ if (nmap < 1)
+ break;
+ /*
+ * Add this bunch into our table, go to the next offset.
+ */
+ mapi += nmap;
+ b = mapp[mapi - 1].br_startoff +
+ mapp[mapi - 1].br_blockcount;
+ }
+ }
+ /*
+ * Didn't work.
+ */
+ else {
+ mapi = 0;
+ mapp = NULL;
+ }
+ /*
+ * See how many fsb's we got.
+ */
+ for (i = 0, got = 0; i < mapi; i++)
+ got += mapp[i].br_blockcount;
+ /*
+ * Didn't get enough fsb's, or the first/last block's are wrong.
+ */
+ if (got != count || mapp[0].br_startoff != bno ||
+ mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+ bno + count) {
+ if (mapp != &map)
+ kmem_free(mapp);
+ return XFS_ERROR(ENOSPC);
+ }
+ /*
+ * Done with the temporary mapping table.
+ */
+ if (mapp != &map)
+ kmem_free(mapp);
+
+ /* account for newly allocated blocks in reserved blocks total */
+ args->total -= dp->i_d.di_nblocks - nblks;
+ *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+
+ /*
+ * Update file's size if this is the data space and it grew.
+ */
+ if (space == XFS_DIR2_DATA_SPACE) {
+ xfs_fsize_t size; /* directory file (data) size */
+
+ size = XFS_FSB_TO_B(mp, bno + count);
+ if (size > dp->i_d.di_size) {
+ dp->i_d.di_size = size;
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ }
+ }
+ return 0;
+}
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int
+xfs_dir2_isblock(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ int *vp) /* out: 1 is block, 0 is not block */
+{
+ xfs_fileoff_t last; /* last file offset */
+ xfs_mount_t *mp;
+ int rval;
+
+ mp = dp->i_mount;
+ if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ return rval;
+ rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
+ ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+ *vp = rval;
+ return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int
+xfs_dir2_isleaf(
+ xfs_trans_t *tp,
+ xfs_inode_t *dp,
+ int *vp) /* out: 1 is leaf, 0 is not leaf */
+{
+ xfs_fileoff_t last; /* last file offset */
+ xfs_mount_t *mp;
+ int rval;
+
+ mp = dp->i_mount;
+ if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ return rval;
+ *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
+ return 0;
+}
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+ xfs_da_args_t *args,
+ xfs_dir2_db_t db,
+ xfs_dabuf_t *bp)
+{
+ xfs_fileoff_t bno; /* directory file offset */
+ xfs_dablk_t da; /* directory file offset */
+ int done; /* bunmap is finished */
+ xfs_inode_t *dp;
+ int error;
+ xfs_mount_t *mp;
+ xfs_trans_t *tp;
+
+ trace_xfs_dir2_shrink_inode(args, db);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ da = xfs_dir2_db_to_da(mp, db);
+ /*
+ * Unmap the fsblock(s).
+ */
+ if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+ XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+ &done))) {
+ /*
+ * ENOSPC actually can happen if we're in a removename with
+ * no space reservation, and the resulting block removal
+ * would cause a bmap btree split or conversion from extents
+ * to btree. This can only happen for un-fragmented
+ * directory blocks, since you need to be punching out
+ * the middle of an extent.
+ * In this case we need to leave the block in the file,
+ * and not binval it.
+ * So the block has to be in a consistent empty state
+ * and appropriately logged.
+ * We don't free up the buffer, the caller can tell it
+ * hasn't happened since it got an error back.
+ */
+ return error;
+ }
+ ASSERT(done);
+ /*
+ * Invalidate the buffer from the transaction.
+ */
+ xfs_da_binval(tp, bp);
+ /*
+ * If it's not a data block, we're done.
+ */
+ if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+ return 0;
+ /*
+ * If the block isn't the last one in the directory, we're done.
+ */
+ if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0))
+ return 0;
+ bno = da;
+ if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+ /*
+ * This can't really happen unless there's kernel corruption.
+ */
+ return error;
+ }
+ if (db == mp->m_dirdatablk)
+ ASSERT(bno == 0);
+ else
+ ASSERT(bno > 0);
+ /*
+ * Set the size to the new last block.
+ */
+ dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ return 0;
+}
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
new file mode 100644
index 00000000..74a3b105
--- /dev/null
+++ b/fs/xfs/xfs_dir2.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_put_args;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Directory version 2.
+ * There are 4 possible formats:
+ * shortform
+ * single block - data with embedded leaf at the end
+ * multiple data blocks, single leaf+freeindex block
+ * data blocks, node&leaf blocks (btree), freeindex blocks
+ *
+ * The shortform format is in xfs_dir2_sf.h.
+ * The single block format is in xfs_dir2_block.h.
+ * The data block format is in xfs_dir2_data.h.
+ * The leaf and freeindex block formats are in xfs_dir2_leaf.h.
+ * Node blocks are the same as the other version, in xfs_da_btree.h.
+ */
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef __uint16_t xfs_dir2_data_off_t;
+#define NULLDATAOFF 0xffffU
+typedef uint xfs_dir2_data_aoff_t; /* argument form */
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef __uint32_t xfs_dir2_db_t;
+
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t xfs_dir2_off_t;
+
+extern struct xfs_name xfs_name_dotdot;
+
+/*
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern void xfs_dir_mount(struct xfs_mount *mp);
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t inum,
+ xfs_fsblock_t *first,
+ struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t *inum,
+ struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t ino,
+ xfs_fsblock_t *first,
+ struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, xfs_ino_t inum,
+ xfs_fsblock_t *first,
+ struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+ struct xfs_name *name, uint resblks);
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
+/*
+ * Utility routines for v2 directories.
+ */
+extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+ xfs_dir2_db_t *dbp);
+extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp,
+ int *vp);
+extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
+ int *vp);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+ struct xfs_dabuf *bp);
+
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
+#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
new file mode 100644
index 00000000..580d99ce
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.c
@@ -0,0 +1,1233 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first,
+ int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
+ int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+ xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+ xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
+}
+
+/*
+ * Add an entry to a block directory.
+ */
+int /* error */
+xfs_dir2_block_addname(
+ xfs_da_args_t *args) /* directory op arguments */
+{
+ xfs_dir2_data_free_t *bf; /* bestfree table in block */
+ xfs_dir2_block_t *block; /* directory block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ xfs_dabuf_t *bp; /* buffer for block */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ int compact; /* need to compact leaf ents */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* directory inode */
+ xfs_dir2_data_unused_t *dup; /* block unused entry */
+ int error; /* error return value */
+ xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */
+ xfs_dahash_t hash; /* hash value of found entry */
+ int high; /* high index for binary srch */
+ int highstale; /* high stale index */
+ int lfloghigh=0; /* last final leaf to log */
+ int lfloglow=0; /* first final leaf to log */
+ int len; /* length of the new entry */
+ int low; /* low index for binary srch */
+ int lowstale; /* low stale index */
+ int mid=0; /* midpoint for binary srch */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needlog; /* need to log header */
+ int needscan; /* need to rescan freespace */
+ __be16 *tagp; /* pointer to tag value */
+ xfs_trans_t *tp; /* transaction structure */
+
+ trace_xfs_dir2_block_addname(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ /*
+ * Read the (one and only) directory block into dabuf bp.
+ */
+ if ((error =
+ xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+ return error;
+ }
+ ASSERT(bp != NULL);
+ block = bp->data;
+ /*
+ * Check the magic number, corrupted if wrong.
+ */
+ if (unlikely(be32_to_cpu(block->hdr.magic) != XFS_DIR2_BLOCK_MAGIC)) {
+ XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
+ XFS_ERRLEVEL_LOW, mp, block);
+ xfs_da_brelse(tp, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ len = xfs_dir2_data_entsize(args->namelen);
+ /*
+ * Set up pointers to parts of the block.
+ */
+ bf = block->hdr.bestfree;
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * No stale entries? Need space for entry and new leaf.
+ */
+ if (!btp->stale) {
+ /*
+ * Tag just before the first leaf entry.
+ */
+ tagp = (__be16 *)blp - 1;
+ /*
+ * Data object just before the first leaf entry.
+ */
+ enddup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp));
+ /*
+ * If it's not free then can't do this add without cleaning up:
+ * the space before the first leaf entry needs to be free so it
+ * can be expanded to hold the pointer to the new entry.
+ */
+ if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ dup = enddup = NULL;
+ /*
+ * Check out the biggest freespace and see if it's the same one.
+ */
+ else {
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)block + be16_to_cpu(bf[0].offset));
+ if (dup == enddup) {
+ /*
+ * It is the biggest freespace, is it too small
+ * to hold the new leaf too?
+ */
+ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+ /*
+ * Yes, we use the second-largest
+ * entry instead if it works.
+ */
+ if (be16_to_cpu(bf[1].length) >= len)
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)block +
+ be16_to_cpu(bf[1].offset));
+ else
+ dup = NULL;
+ }
+ } else {
+ /*
+ * Not the same free entry,
+ * just check its length.
+ */
+ if (be16_to_cpu(dup->length) < len) {
+ dup = NULL;
+ }
+ }
+ }
+ compact = 0;
+ }
+ /*
+ * If there are stale entries we'll use one for the leaf.
+ * Is the biggest entry enough to avoid compaction?
+ */
+ else if (be16_to_cpu(bf[0].length) >= len) {
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)block + be16_to_cpu(bf[0].offset));
+ compact = 0;
+ }
+ /*
+ * Will need to compact to make this work.
+ */
+ else {
+ /*
+ * Tag just before the first leaf entry.
+ */
+ tagp = (__be16 *)blp - 1;
+ /*
+ * Data object just before the first leaf entry.
+ */
+ dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp));
+ /*
+ * If it's not free then the data will go where the
+ * leaf data starts now, if it works at all.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+ (uint)sizeof(*blp) < len)
+ dup = NULL;
+ } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+ dup = NULL;
+ else
+ dup = (xfs_dir2_data_unused_t *)blp;
+ compact = 1;
+ }
+ /*
+ * If this isn't a real add, we're done with the buffer.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ xfs_da_brelse(tp, bp);
+ /*
+ * If we don't have space for the new entry & leaf ...
+ */
+ if (!dup) {
+ /*
+ * Not trying to actually do anything, or don't have
+ * a space reservation: return no-space.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+ return XFS_ERROR(ENOSPC);
+ /*
+ * Convert to the next larger format.
+ * Then add the new entry in that format.
+ */
+ error = xfs_dir2_block_to_leaf(args, bp);
+ xfs_da_buf_done(bp);
+ if (error)
+ return error;
+ return xfs_dir2_leaf_addname(args);
+ }
+ /*
+ * Just checking, and it would work, so say so.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ return 0;
+ needlog = needscan = 0;
+ /*
+ * If need to compact the leaf entries, do it now.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+ if (compact) {
+ int fromidx; /* source leaf index */
+ int toidx; /* target leaf index */
+
+ for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
+ highstale = lfloghigh = -1;
+ fromidx >= 0;
+ fromidx--) {
+ if (be32_to_cpu(blp[fromidx].address) == XFS_DIR2_NULL_DATAPTR) {
+ if (highstale == -1)
+ highstale = toidx;
+ else {
+ if (lfloghigh == -1)
+ lfloghigh = toidx;
+ continue;
+ }
+ }
+ if (fromidx < toidx)
+ blp[toidx] = blp[fromidx];
+ toidx--;
+ }
+ lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+ lfloghigh -= be32_to_cpu(btp->stale) - 1;
+ be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+ xfs_dir2_data_make_free(tp, bp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
+ (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+ &needlog, &needscan);
+ blp += be32_to_cpu(btp->stale) - 1;
+ btp->stale = cpu_to_be32(1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan) {
+ xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
+ needscan = 0;
+ }
+ }
+ /*
+ * Set leaf logging boundaries to impossible state.
+ * For the no-stale case they're set explicitly.
+ */
+ else if (btp->stale) {
+ lfloglow = be32_to_cpu(btp->count);
+ lfloghigh = -1;
+ }
+ /*
+ * Find the slot that's first lower than our hash value, -1 if none.
+ */
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+ break;
+ if (hash < args->hashval)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
+ while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+ mid--;
+ }
+ /*
+ * No stale entries, will use enddup space to hold new leaf.
+ */
+ if (!btp->stale) {
+ /*
+ * Mark the space needed for the new leaf entry, now in use.
+ */
+ xfs_dir2_data_use_free(tp, bp, enddup,
+ (xfs_dir2_data_aoff_t)
+ ((char *)enddup - (char *)block + be16_to_cpu(enddup->length) -
+ sizeof(*blp)),
+ (xfs_dir2_data_aoff_t)sizeof(*blp),
+ &needlog, &needscan);
+ /*
+ * Update the tail (entry count).
+ */
+ be32_add_cpu(&btp->count, 1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan) {
+ xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
+ &needlog);
+ needscan = 0;
+ }
+ /*
+ * Adjust pointer to the first leaf entry, we're about to move
+ * the table up one to open up space for the new leaf entry.
+ * Then adjust our index to match.
+ */
+ blp--;
+ mid++;
+ if (mid)
+ memmove(blp, &blp[1], mid * sizeof(*blp));
+ lfloglow = 0;
+ lfloghigh = mid;
+ }
+ /*
+ * Use a stale leaf for our new entry.
+ */
+ else {
+ for (lowstale = mid;
+ lowstale >= 0 &&
+ be32_to_cpu(blp[lowstale].address) != XFS_DIR2_NULL_DATAPTR;
+ lowstale--)
+ continue;
+ for (highstale = mid + 1;
+ highstale < be32_to_cpu(btp->count) &&
+ be32_to_cpu(blp[highstale].address) != XFS_DIR2_NULL_DATAPTR &&
+ (lowstale < 0 || mid - lowstale > highstale - mid);
+ highstale++)
+ continue;
+ /*
+ * Move entries toward the low-numbered stale entry.
+ */
+ if (lowstale >= 0 &&
+ (highstale == be32_to_cpu(btp->count) ||
+ mid - lowstale <= highstale - mid)) {
+ if (mid - lowstale)
+ memmove(&blp[lowstale], &blp[lowstale + 1],
+ (mid - lowstale) * sizeof(*blp));
+ lfloglow = MIN(lowstale, lfloglow);
+ lfloghigh = MAX(mid, lfloghigh);
+ }
+ /*
+ * Move entries toward the high-numbered stale entry.
+ */
+ else {
+ ASSERT(highstale < be32_to_cpu(btp->count));
+ mid++;
+ if (highstale - mid)
+ memmove(&blp[mid + 1], &blp[mid],
+ (highstale - mid) * sizeof(*blp));
+ lfloglow = MIN(mid, lfloglow);
+ lfloghigh = MAX(highstale, lfloghigh);
+ }
+ be32_add_cpu(&btp->stale, -1);
+ }
+ /*
+ * Point to the new data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ /*
+ * Fill in the leaf entry.
+ */
+ blp[mid].hashval = cpu_to_be32(args->hashval);
+ blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ (char *)dep - (char *)block));
+ xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+ /*
+ * Mark space for the data entry used.
+ */
+ xfs_dir2_data_use_free(tp, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+ (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+ /*
+ * Create the new data entry.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, args->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)block);
+ /*
+ * Clean up the bestfree array and log the header, tail, and entry.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(tp, bp);
+ xfs_dir2_block_log_tail(tp, bp);
+ xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_check(dp, bp);
+ xfs_da_buf_done(bp);
+ return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+int /* error */
+xfs_dir2_block_getdents(
+ xfs_inode_t *dp, /* incore inode */
+ void *dirent,
+ xfs_off_t *offset,
+ filldir_t filldir)
+{
+ xfs_dir2_block_t *block; /* directory block structure */
+ xfs_dabuf_t *bp; /* buffer for block */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_dir2_data_unused_t *dup; /* block unused entry */
+ char *endptr; /* end of the data entries */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* filesystem mount point */
+ char *ptr; /* current data entry */
+ int wantoff; /* starting block offset */
+ xfs_off_t cook;
+
+ mp = dp->i_mount;
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) {
+ return 0;
+ }
+ /*
+ * Can't read the block, give up, else get dabuf in bp.
+ */
+ error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
+ &bp, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ ASSERT(bp != NULL);
+ /*
+ * Extract the byte offset we start at from the seek pointer.
+ * We'll skip entries before this.
+ */
+ wantoff = xfs_dir2_dataptr_to_off(mp, *offset);
+ block = bp->data;
+ xfs_dir2_data_check(dp, bp);
+ /*
+ * Set up values for the loop.
+ */
+ btp = xfs_dir2_block_tail_p(mp, block);
+ ptr = (char *)block->u;
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Loop over the data portion of the block.
+ * Each object is a real entry (dep) or an unused one (dup).
+ */
+ while (ptr < endptr) {
+ dup = (xfs_dir2_data_unused_t *)ptr;
+ /*
+ * Unused, skip it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ptr += be16_to_cpu(dup->length);
+ continue;
+ }
+
+ dep = (xfs_dir2_data_entry_t *)ptr;
+
+ /*
+ * Bump pointer for the next iteration.
+ */
+ ptr += xfs_dir2_data_entsize(dep->namelen);
+ /*
+ * The entry is before the desired starting point, skip it.
+ */
+ if ((char *)dep - (char *)block < wantoff)
+ continue;
+
+ cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ (char *)dep - (char *)block);
+
+ /*
+ * If it didn't fit, set the final offset to here & return.
+ */
+ if (filldir(dirent, (char *)dep->name, dep->namelen,
+ cook & 0x7fffffff, be64_to_cpu(dep->inumber),
+ DT_UNKNOWN)) {
+ *offset = cook & 0x7fffffff;
+ xfs_da_brelse(NULL, bp);
+ return 0;
+ }
+ }
+
+ /*
+ * Reached the end of the block.
+ * Set the offset to a non-existent block 1 and return.
+ */
+ *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+ 0x7fffffff;
+ xfs_da_brelse(NULL, bp);
+ return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+ xfs_trans_t *tp, /* transaction structure */
+ xfs_dabuf_t *bp, /* block buffer */
+ int first, /* index of first logged leaf */
+ int last) /* index of last logged leaf */
+{
+ xfs_dir2_block_t *block; /* directory block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_mount_t *mp; /* filesystem mount point */
+
+ mp = tp->t_mountp;
+ block = bp->data;
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),
+ (uint)((char *)&blp[last + 1] - (char *)block - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+ xfs_trans_t *tp, /* transaction structure */
+ xfs_dabuf_t *bp) /* block buffer */
+{
+ xfs_dir2_block_t *block; /* directory block structure */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_mount_t *mp; /* filesystem mount point */
+
+ mp = tp->t_mountp;
+ block = bp->data;
+ btp = xfs_dir2_block_tail_p(mp, block);
+ xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
+ (uint)((char *)(btp + 1) - (char *)block - 1));
+}
+
+/*
+ * Look up an entry in the block. This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int /* error */
+xfs_dir2_block_lookup(
+ xfs_da_args_t *args) /* dir lookup arguments */
+{
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ xfs_dabuf_t *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* entry index */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* filesystem mount point */
+
+ trace_xfs_dir2_block_lookup(args);
+
+ /*
+ * Get the buffer, look up the entry.
+ * If not found (ENOENT) then return, have no buffer.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+ return error;
+ dp = args->dp;
+ mp = dp->i_mount;
+ block = bp->data;
+ xfs_dir2_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Get the offset from the leaf entry, to point to the data.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)block +
+ xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ /*
+ * Fill in inode number, CI name if appropriate, release the block.
+ */
+ args->inumber = be64_to_cpu(dep->inumber);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_da_brelse(args->trans, bp);
+ return XFS_ERROR(error);
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int /* error */
+xfs_dir2_block_lookup_int(
+ xfs_da_args_t *args, /* dir lookup arguments */
+ xfs_dabuf_t **bpp, /* returned block buffer */
+ int *entno) /* returned entry number */
+{
+ xfs_dir2_dataptr_t addr; /* data entry address */
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ xfs_dabuf_t *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int error; /* error return value */
+ xfs_dahash_t hash; /* found hash value */
+ int high; /* binary search high index */
+ int low; /* binary search low index */
+ int mid; /* binary search current idx */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ /*
+ * Read the buffer, return error if we can't get it.
+ */
+ if ((error =
+ xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+ return error;
+ }
+ ASSERT(bp != NULL);
+ block = bp->data;
+ xfs_dir2_data_check(dp, bp);
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Loop doing a binary search for our hash value.
+ * Find our entry, ENOENT if it's not there.
+ */
+ for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+ ASSERT(low <= high);
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+ break;
+ if (hash < args->hashval)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ if (low > high) {
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ xfs_da_brelse(tp, bp);
+ return XFS_ERROR(ENOENT);
+ }
+ }
+ /*
+ * Back up to the first one with the right hash value.
+ */
+ while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+ mid--;
+ }
+ /*
+ * Now loop forward through all the entries with the
+ * right hash value looking for our name.
+ */
+ do {
+ if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Get pointer to the entry from the leaf.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
+ /*
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
+ */
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ *bpp = bp;
+ *entno = mid;
+ if (cmp == XFS_CMP_EXACT)
+ return 0;
+ }
+ } while (++mid < be32_to_cpu(btp->count) &&
+ be32_to_cpu(blp[mid].hashval) == hash);
+
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was found earlier, return success.
+ */
+ if (args->cmpresult == XFS_CMP_CASE)
+ return 0;
+ /*
+ * No match, release the buffer and return ENOENT.
+ */
+ xfs_da_brelse(tp, bp);
+ return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int /* error */
+xfs_dir2_block_removename(
+ xfs_da_args_t *args) /* directory operation args */
+{
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
+ xfs_dabuf_t *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* block leaf entry index */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needlog; /* need to log block header */
+ int needscan; /* need to fixup bestfree */
+ xfs_dir2_sf_hdr_t sfh; /* shortform header */
+ int size; /* shortform size */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_block_removename(args);
+
+ /*
+ * Look up the entry in the block. Gets the buffer and entry index.
+ * It will always be there, the vnodeops level does a lookup first.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+ return error;
+ }
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ block = bp->data;
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Point to the data entry using the leaf entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ /*
+ * Mark the data entry's space free.
+ */
+ needlog = needscan = 0;
+ xfs_dir2_data_make_free(tp, bp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
+ xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+ /*
+ * Fix up the block tail.
+ */
+ be32_add_cpu(&btp->stale, 1);
+ xfs_dir2_block_log_tail(tp, bp);
+ /*
+ * Remove the leaf entry by marking it stale.
+ */
+ blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+ /*
+ * Fix up bestfree, log the header if necessary.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(tp, bp);
+ xfs_dir2_data_check(dp, bp);
+ /*
+ * See if the size as a shortform is good enough.
+ */
+ if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+ XFS_IFORK_DSIZE(dp)) {
+ xfs_da_buf_done(bp);
+ return 0;
+ }
+ /*
+ * If it works, do the conversion.
+ */
+ return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int /* error */
+xfs_dir2_block_replace(
+ xfs_da_args_t *args) /* directory operation args */
+{
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ xfs_dabuf_t *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* block data entry */
+ xfs_inode_t *dp; /* incore inode */
+ int ent; /* leaf entry index */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* filesystem mount point */
+
+ trace_xfs_dir2_block_replace(args);
+
+ /*
+ * Lookup the entry in the directory. Get buffer and entry index.
+ * This will always succeed since the caller has already done a lookup.
+ */
+ if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+ return error;
+ }
+ dp = args->dp;
+ mp = dp->i_mount;
+ block = bp->data;
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Point to the data entry we need to change.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+ /*
+ * Change the inode number to the new value.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ xfs_dir2_data_log_entry(args->trans, bp, dep);
+ xfs_dir2_data_check(dp, bp);
+ xfs_da_buf_done(bp);
+ return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int /* sort order */
+xfs_dir2_block_sort(
+ const void *a, /* first leaf entry */
+ const void *b) /* second leaf entry */
+{
+ const xfs_dir2_leaf_entry_t *la; /* first leaf entry */
+ const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */
+
+ la = a;
+ lb = b;
+ return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+ (be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int /* error */
+xfs_dir2_leaf_to_block(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *lbp, /* leaf buffer */
+ xfs_dabuf_t *dbp) /* data buffer */
+{
+ __be16 *bestsp; /* leaf bests table */
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_data_unused_t *dup; /* unused data entry */
+ int error; /* error return value */
+ int from; /* leaf from index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* file system mount point */
+ int needlog; /* need to log data header */
+ int needscan; /* need to scan for bestfree */
+ xfs_dir2_sf_hdr_t sfh; /* shortform header */
+ int size; /* bytes used */
+ __be16 *tagp; /* end of entry (tag) */
+ int to; /* block/leaf to index */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leaf_to_block(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = lbp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC);
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ /*
+ * If there are data blocks other than the first one, take this
+ * opportunity to remove trailing empty data blocks that may have
+ * been left behind during no-space-reservation operations.
+ * These will show up in the leaf bests table.
+ */
+ while (dp->i_d.di_size > mp->m_dirblksize) {
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+ mp->m_dirblksize - (uint)sizeof(block->hdr)) {
+ if ((error =
+ xfs_dir2_leaf_trim_data(args, lbp,
+ (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+ goto out;
+ } else {
+ error = 0;
+ goto out;
+ }
+ }
+ /*
+ * Read the data block if we don't already have it, give up if it fails.
+ */
+ if (dbp == NULL &&
+ (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
+ XFS_DATA_FORK))) {
+ goto out;
+ }
+ block = dbp->data;
+ ASSERT(be32_to_cpu(block->hdr.magic) == XFS_DIR2_DATA_MAGIC);
+ /*
+ * Size of the "leaf" area in the block.
+ */
+ size = (uint)sizeof(block->tail) +
+ (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+ /*
+ * Look at the last data entry.
+ */
+ tagp = (__be16 *)((char *)block + mp->m_dirblksize) - 1;
+ dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp));
+ /*
+ * If it's not free or is too short we can't do it.
+ */
+ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+ be16_to_cpu(dup->length) < size) {
+ error = 0;
+ goto out;
+ }
+ /*
+ * Start converting it to block form.
+ */
+ block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+ needlog = 1;
+ needscan = 0;
+ /*
+ * Use up the space at the end of the block (blp/btp).
+ */
+ xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+ &needlog, &needscan);
+ /*
+ * Initialize the block tail.
+ */
+ btp = xfs_dir2_block_tail_p(mp, block);
+ btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+ btp->stale = 0;
+ xfs_dir2_block_log_tail(tp, dbp);
+ /*
+ * Initialize the block leaf area. We compact out stale entries.
+ */
+ lep = xfs_dir2_block_leaf_p(btp);
+ for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
+ if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ lep[to++] = leaf->ents[from];
+ }
+ ASSERT(to == be32_to_cpu(btp->count));
+ xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+ /*
+ * Scan the bestfree if we need it and log the data block header.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ /*
+ * Pitch the old leaf block.
+ */
+ error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
+ lbp = NULL;
+ if (error) {
+ goto out;
+ }
+ /*
+ * Now see if the resulting block can be shrunken to shortform.
+ */
+ if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+ XFS_IFORK_DSIZE(dp)) {
+ error = 0;
+ goto out;
+ }
+ return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+out:
+ if (lbp)
+ xfs_da_buf_done(lbp);
+ if (dbp)
+ xfs_da_buf_done(dbp);
+ return error;
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int /* error */
+xfs_dir2_sf_to_block(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_dir2_db_t blkno; /* dir-relative block # (0) */
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
+ xfs_dabuf_t *bp; /* block buffer */
+ xfs_dir2_block_tail_t *btp; /* block tail pointer */
+ char *buf; /* sf buffer */
+ int buf_len;
+ xfs_dir2_data_entry_t *dep; /* data entry pointer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int dummy; /* trash */
+ xfs_dir2_data_unused_t *dup; /* unused entry pointer */
+ int endoffset; /* end of data objects */
+ int error; /* error return value */
+ int i; /* index */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needlog; /* need to log block header */
+ int needscan; /* need to scan block freespc */
+ int newoffset; /* offset from current entry */
+ int offset; /* target block offset */
+ xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+ __be16 *tagp; /* end of data entry */
+ xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_name name;
+
+ trace_xfs_dir2_sf_to_block(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ /*
+ * Bomb out if the shortform directory is way too short.
+ */
+ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ return XFS_ERROR(EIO);
+ }
+ ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
+ /*
+ * Copy the directory into the stack buffer.
+ * Then pitch the incore inode data so we can make extents.
+ */
+
+ buf_len = dp->i_df.if_bytes;
+ buf = kmem_alloc(buf_len, KM_SLEEP);
+
+ memcpy(buf, sfp, buf_len);
+ xfs_idata_realloc(dp, -buf_len, XFS_DATA_FORK);
+ dp->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ /*
+ * Reset pointer - old sfp is gone.
+ */
+ sfp = (xfs_dir2_sf_t *)buf;
+ /*
+ * Add block 0 to the inode.
+ */
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+ if (error) {
+ kmem_free(buf);
+ return error;
+ }
+ /*
+ * Initialize the data block.
+ */
+ error = xfs_dir2_data_init(args, blkno, &bp);
+ if (error) {
+ kmem_free(buf);
+ return error;
+ }
+ block = bp->data;
+ block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+ /*
+ * Compute size of block "tail" area.
+ */
+ i = (uint)sizeof(*btp) +
+ (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+ /*
+ * The whole thing is initialized to free by the init routine.
+ * Say we're using the leaf and tail area.
+ */
+ dup = (xfs_dir2_data_unused_t *)block->u;
+ needlog = needscan = 0;
+ xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
+ &needscan);
+ ASSERT(needscan == 0);
+ /*
+ * Fill in the tail.
+ */
+ btp = xfs_dir2_block_tail_p(mp, block);
+ btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */
+ btp->stale = 0;
+ blp = xfs_dir2_block_leaf_p(btp);
+ endoffset = (uint)((char *)blp - (char *)block);
+ /*
+ * Remove the freespace, we'll manage it.
+ */
+ xfs_dir2_data_use_free(tp, bp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+ be16_to_cpu(dup->length), &needlog, &needscan);
+ /*
+ * Create entry for .
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
+ dep->inumber = cpu_to_be64(dp->i_ino);
+ dep->namelen = 1;
+ dep->name[0] = '.';
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)block);
+ xfs_dir2_data_log_entry(tp, bp, dep);
+ blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ (char *)dep - (char *)block));
+ /*
+ * Create entry for ..
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent));
+ dep->namelen = 2;
+ dep->name[0] = dep->name[1] = '.';
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)block);
+ xfs_dir2_data_log_entry(tp, bp, dep);
+ blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ (char *)dep - (char *)block));
+ offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ /*
+ * Loop over existing entries, stuff them in.
+ */
+ if ((i = 0) == sfp->hdr.count)
+ sfep = NULL;
+ else
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ /*
+ * Need to preserve the existing offset values in the sf directory.
+ * Insert holes (unused entries) where necessary.
+ */
+ while (offset < endoffset) {
+ /*
+ * sfep is null when we reach the end of the list.
+ */
+ if (sfep == NULL)
+ newoffset = endoffset;
+ else
+ newoffset = xfs_dir2_sf_get_offset(sfep);
+ /*
+ * There should be a hole here, make one.
+ */
+ if (offset < newoffset) {
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)block + offset);
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ dup->length = cpu_to_be16(newoffset - offset);
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
+ ((char *)dup - (char *)block));
+ xfs_dir2_data_log_unused(tp, bp, dup);
+ (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block,
+ dup, &dummy);
+ offset += be16_to_cpu(dup->length);
+ continue;
+ }
+ /*
+ * Copy a real entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_inumber(sfp,
+ xfs_dir2_sf_inumberp(sfep)));
+ dep->namelen = sfep->namelen;
+ memcpy(dep->name, sfep->name, dep->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)block);
+ xfs_dir2_data_log_entry(tp, bp, dep);
+ name.name = sfep->name;
+ name.len = sfep->namelen;
+ blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+ hashname(&name));
+ blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ (char *)dep - (char *)block));
+ offset = (int)((char *)(tagp + 1) - (char *)block);
+ if (++i == sfp->hdr.count)
+ sfep = NULL;
+ else
+ sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ }
+ /* Done with the temporary buffer */
+ kmem_free(buf);
+ /*
+ * Sort the leaf entries by hash value.
+ */
+ xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+ /*
+ * Log the leaf entry area and tail.
+ * Already logged the header in data_init, ignore needlog.
+ */
+ ASSERT(needscan == 0);
+ xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+ xfs_dir2_block_log_tail(tp, bp);
+ xfs_dir2_data_check(dp, bp);
+ xfs_da_buf_done(bp);
+ return 0;
+}
diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h
new file mode 100644
index 00000000..10e68967
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DIR2_BLOCK_H__
+#define __XFS_DIR2_BLOCK_H__
+
+/*
+ * xfs_dir2_block.h
+ * Directory version 2, single block format structures
+ */
+
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_leaf_entry;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * The single block format is as follows:
+ * xfs_dir2_data_hdr_t structure
+ * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures
+ * xfs_dir2_leaf_entry_t structures
+ * xfs_dir2_block_tail_t structure
+ */
+
+#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */
+
+typedef struct xfs_dir2_block_tail {
+ __be32 count; /* count of leaf entries */
+ __be32 stale; /* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Generic single-block structure, for xfs_db.
+ */
+typedef struct xfs_dir2_block {
+ xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */
+ xfs_dir2_data_union_t u[1];
+ xfs_dir2_leaf_entry_t leaf[1];
+ xfs_dir2_block_tail_t tail;
+} xfs_dir2_block_t;
+
+/*
+ * Pointer to the leaf header embedded in a data block (1-block format)
+ */
+static inline xfs_dir2_block_tail_t *
+xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block)
+{
+ return (((xfs_dir2_block_tail_t *)
+ ((char *)(block) + (mp)->m_dirblksize)) - 1);
+}
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp)
+{
+ return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+}
+
+/*
+ * Function declarations.
+ */
+extern int xfs_dir2_block_addname(struct xfs_da_args *args);
+extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
+ xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_block_removename(struct xfs_da_args *args);
+extern int xfs_dir2_block_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
+ struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+#endif /* __XFS_DIR2_BLOCK_H__ */
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
new file mode 100644
index 00000000..921595b8
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_error.h"
+
+#ifdef DEBUG
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Pop an assert if we find anything bad.
+ */
+void
+xfs_dir2_data_check(
+ xfs_inode_t *dp, /* incore inode pointer */
+ xfs_dabuf_t *bp) /* data block's buffer */
+{
+ xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
+ xfs_dir2_data_free_t *bf; /* bestfree table */
+ xfs_dir2_block_tail_t *btp=NULL; /* block tail */
+ int count; /* count of entries found */
+ xfs_dir2_data_t *d; /* data block pointer */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_dir2_data_free_t *dfp; /* bestfree entry */
+ xfs_dir2_data_unused_t *dup; /* unused entry */
+ char *endp; /* end of useful data */
+ int freeseen; /* mask of bestfrees seen */
+ xfs_dahash_t hash; /* hash of current name */
+ int i; /* leaf index */
+ int lastfree; /* last entry was unused */
+ xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
+ xfs_mount_t *mp; /* filesystem mount point */
+ char *p; /* current data position */
+ int stale; /* count of stale leaves */
+ struct xfs_name name;
+
+ mp = dp->i_mount;
+ d = bp->data;
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+ bf = d->hdr.bestfree;
+ p = (char *)d->u;
+ if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
+ btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d);
+ lep = xfs_dir2_block_leaf_p(btp);
+ endp = (char *)lep;
+ } else
+ endp = (char *)d + mp->m_dirblksize;
+ count = lastfree = freeseen = 0;
+ /*
+ * Account for zero bestfree entries.
+ */
+ if (!bf[0].length) {
+ ASSERT(!bf[0].offset);
+ freeseen |= 1 << 0;
+ }
+ if (!bf[1].length) {
+ ASSERT(!bf[1].offset);
+ freeseen |= 1 << 1;
+ }
+ if (!bf[2].length) {
+ ASSERT(!bf[2].offset);
+ freeseen |= 1 << 2;
+ }
+ ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length));
+ ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length));
+ /*
+ * Loop over the data/unused entries.
+ */
+ while (p < endp) {
+ dup = (xfs_dir2_data_unused_t *)p;
+ /*
+ * If it's unused, look for the space in the bestfree table.
+ * If we find it, account for that, else make sure it
+ * doesn't need to be there.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ASSERT(lastfree == 0);
+ ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+ (char *)dup - (char *)d);
+ dfp = xfs_dir2_data_freefind(d, dup);
+ if (dfp) {
+ i = (int)(dfp - bf);
+ ASSERT((freeseen & (1 << i)) == 0);
+ freeseen |= 1 << i;
+ } else {
+ ASSERT(be16_to_cpu(dup->length) <=
+ be16_to_cpu(bf[2].length));
+ }
+ p += be16_to_cpu(dup->length);
+ lastfree = 1;
+ continue;
+ }
+ /*
+ * It's a real entry. Validate the fields.
+ * If this is a block directory then make sure it's
+ * in the leaf section of the block.
+ * The linear search is crude but this is DEBUG code.
+ */
+ dep = (xfs_dir2_data_entry_t *)p;
+ ASSERT(dep->namelen != 0);
+ ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
+ ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+ (char *)dep - (char *)d);
+ count++;
+ lastfree = 0;
+ if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
+ addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)d));
+ name.name = dep->name;
+ name.len = dep->namelen;
+ hash = mp->m_dirnameops->hashname(&name);
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if (be32_to_cpu(lep[i].address) == addr &&
+ be32_to_cpu(lep[i].hashval) == hash)
+ break;
+ }
+ ASSERT(i < be32_to_cpu(btp->count));
+ }
+ p += xfs_dir2_data_entsize(dep->namelen);
+ }
+ /*
+ * Need to have seen all the entries and all the bestfree slots.
+ */
+ ASSERT(freeseen == 7);
+ if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
+ for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+ if (be32_to_cpu(lep[i].address) == XFS_DIR2_NULL_DATAPTR)
+ stale++;
+ if (i > 0)
+ ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval));
+ }
+ ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+ ASSERT(stale == be32_to_cpu(btp->stale));
+ }
+}
+#endif
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry if any that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+ xfs_dir2_data_t *d, /* data block */
+ xfs_dir2_data_unused_t *dup) /* data unused entry */
+{
+ xfs_dir2_data_free_t *dfp; /* bestfree entry */
+ xfs_dir2_data_aoff_t off; /* offset value needed */
+#if defined(DEBUG) && defined(__KERNEL__)
+ int matched; /* matched the value */
+ int seenzero; /* saw a 0 bestfree entry */
+#endif
+
+ off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d);
+#if defined(DEBUG) && defined(__KERNEL__)
+ /*
+ * Validate some consistency in the bestfree table.
+ * Check order, non-overlapping entries, and if we find the
+ * one we're looking for it has to be exact.
+ */
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+ for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0;
+ dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
+ dfp++) {
+ if (!dfp->offset) {
+ ASSERT(!dfp->length);
+ seenzero = 1;
+ continue;
+ }
+ ASSERT(seenzero == 0);
+ if (be16_to_cpu(dfp->offset) == off) {
+ matched = 1;
+ ASSERT(dfp->length == dup->length);
+ } else if (off < be16_to_cpu(dfp->offset))
+ ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset));
+ else
+ ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
+ ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
+ if (dfp > &d->hdr.bestfree[0])
+ ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
+ }
+#endif
+ /*
+ * If this is smaller than the smallest bestfree entry,
+ * it can't be there since they're sorted.
+ */
+ if (be16_to_cpu(dup->length) <
+ be16_to_cpu(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length))
+ return NULL;
+ /*
+ * Look at the three bestfree entries for our guy.
+ */
+ for (dfp = &d->hdr.bestfree[0];
+ dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
+ dfp++) {
+ if (!dfp->offset)
+ return NULL;
+ if (be16_to_cpu(dfp->offset) == off)
+ return dfp;
+ }
+ /*
+ * Didn't find it. This only happens if there are duplicate lengths.
+ */
+ return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t * /* entry inserted */
+xfs_dir2_data_freeinsert(
+ xfs_dir2_data_t *d, /* data block pointer */
+ xfs_dir2_data_unused_t *dup, /* unused space */
+ int *loghead) /* log the data header (out) */
+{
+ xfs_dir2_data_free_t *dfp; /* bestfree table pointer */
+ xfs_dir2_data_free_t new; /* new bestfree entry */
+
+#ifdef __KERNEL__
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+ dfp = d->hdr.bestfree;
+ new.length = dup->length;
+ new.offset = cpu_to_be16((char *)dup - (char *)d);
+ /*
+ * Insert at position 0, 1, or 2; or not at all.
+ */
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
+ dfp[2] = dfp[1];
+ dfp[1] = dfp[0];
+ dfp[0] = new;
+ *loghead = 1;
+ return &dfp[0];
+ }
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
+ dfp[2] = dfp[1];
+ dfp[1] = new;
+ *loghead = 1;
+ return &dfp[1];
+ }
+ if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
+ dfp[2] = new;
+ *loghead = 1;
+ return &dfp[2];
+ }
+ return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+ xfs_dir2_data_t *d, /* data block pointer */
+ xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */
+ int *loghead) /* out: log data header */
+{
+#ifdef __KERNEL__
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+ /*
+ * It's the first entry, slide the next 2 up.
+ */
+ if (dfp == &d->hdr.bestfree[0]) {
+ d->hdr.bestfree[0] = d->hdr.bestfree[1];
+ d->hdr.bestfree[1] = d->hdr.bestfree[2];
+ }
+ /*
+ * It's the second entry, slide the 3rd entry up.
+ */
+ else if (dfp == &d->hdr.bestfree[1])
+ d->hdr.bestfree[1] = d->hdr.bestfree[2];
+ /*
+ * Must be the last entry.
+ */
+ else
+ ASSERT(dfp == &d->hdr.bestfree[2]);
+ /*
+ * Clear the 3rd entry, must be zero now.
+ */
+ d->hdr.bestfree[2].length = 0;
+ d->hdr.bestfree[2].offset = 0;
+ *loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+ xfs_mount_t *mp, /* filesystem mount point */
+ xfs_dir2_data_t *d, /* data block pointer */
+ int *loghead) /* out: log data header */
+{
+ xfs_dir2_block_tail_t *btp; /* block tail */
+ xfs_dir2_data_entry_t *dep; /* active data entry */
+ xfs_dir2_data_unused_t *dup; /* unused data entry */
+ char *endp; /* end of block's data */
+ char *p; /* current entry pointer */
+
+#ifdef __KERNEL__
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+ /*
+ * Start by clearing the table.
+ */
+ memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree));
+ *loghead = 1;
+ /*
+ * Set up pointers.
+ */
+ p = (char *)d->u;
+ if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) {
+ btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d);
+ endp = (char *)xfs_dir2_block_leaf_p(btp);
+ } else
+ endp = (char *)d + mp->m_dirblksize;
+ /*
+ * Loop over the block's entries.
+ */
+ while (p < endp) {
+ dup = (xfs_dir2_data_unused_t *)p;
+ /*
+ * If it's a free entry, insert it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ASSERT((char *)dup - (char *)d ==
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+ xfs_dir2_data_freeinsert(d, dup, loghead);
+ p += be16_to_cpu(dup->length);
+ }
+ /*
+ * For active entries, check their tags and skip them.
+ */
+ else {
+ dep = (xfs_dir2_data_entry_t *)p;
+ ASSERT((char *)dep - (char *)d ==
+ be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)));
+ p += xfs_dir2_data_entsize(dep->namelen);
+ }
+ }
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int /* error */
+xfs_dir2_data_init(
+ xfs_da_args_t *args, /* directory operation args */
+ xfs_dir2_db_t blkno, /* logical dir block number */
+ xfs_dabuf_t **bpp) /* output block buffer */
+{
+ xfs_dabuf_t *bp; /* block buffer */
+ xfs_dir2_data_t *d; /* pointer to block */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_data_unused_t *dup; /* unused entry pointer */
+ int error; /* error return value */
+ int i; /* bestfree index */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_trans_t *tp; /* transaction pointer */
+ int t; /* temp */
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ /*
+ * Get the buffer set up for the block.
+ */
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
+ XFS_DATA_FORK);
+ if (error) {
+ return error;
+ }
+ ASSERT(bp != NULL);
+ /*
+ * Initialize the header.
+ */
+ d = bp->data;
+ d->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ d->hdr.bestfree[0].offset = cpu_to_be16(sizeof(d->hdr));
+ for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+ d->hdr.bestfree[i].length = 0;
+ d->hdr.bestfree[i].offset = 0;
+ }
+ /*
+ * Set up an unused entry for the block's body.
+ */
+ dup = &d->u[0].unused;
+ dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+
+ t=mp->m_dirblksize - (uint)sizeof(d->hdr);
+ d->hdr.bestfree[0].length = cpu_to_be16(t);
+ dup->length = cpu_to_be16(t);
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)d);
+ /*
+ * Log it and return it.
+ */
+ xfs_dir2_data_log_header(tp, bp);
+ xfs_dir2_data_log_unused(tp, bp, dup);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp, /* block buffer */
+ xfs_dir2_data_entry_t *dep) /* data entry pointer */
+{
+ xfs_dir2_data_t *d; /* data block pointer */
+
+ d = bp->data;
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+ xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d),
+ (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
+ (char *)d - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp) /* block buffer */
+{
+ xfs_dir2_data_t *d; /* data block pointer */
+
+ d = bp->data;
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+ xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d),
+ (uint)(sizeof(d->hdr) - 1));
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp, /* block buffer */
+ xfs_dir2_data_unused_t *dup) /* data unused pointer */
+{
+ xfs_dir2_data_t *d; /* data block pointer */
+
+ d = bp->data;
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+ /*
+ * Log the first part of the unused entry.
+ */
+ xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d),
+ (uint)((char *)&dup->length + sizeof(dup->length) -
+ 1 - (char *)d));
+ /*
+ * Log the end (tag) of the unused entry.
+ */
+ xfs_da_log_buf(tp, bp,
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d),
+ (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)d +
+ sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp, /* block buffer */
+ xfs_dir2_data_aoff_t offset, /* starting byte offset */
+ xfs_dir2_data_aoff_t len, /* length in bytes */
+ int *needlogp, /* out: log header */
+ int *needscanp) /* out: regen bestfree */
+{
+ xfs_dir2_data_t *d; /* data block pointer */
+ xfs_dir2_data_free_t *dfp; /* bestfree pointer */
+ char *endptr; /* end of data area */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needscan; /* need to regen bestfree */
+ xfs_dir2_data_unused_t *newdup; /* new unused entry */
+ xfs_dir2_data_unused_t *postdup; /* unused entry after us */
+ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+
+ mp = tp->t_mountp;
+ d = bp->data;
+ /*
+ * Figure out where the end of the data area is.
+ */
+ if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC)
+ endptr = (char *)d + mp->m_dirblksize;
+ else {
+ xfs_dir2_block_tail_t *btp; /* block tail */
+
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+ btp = xfs_dir2_block_tail_p(mp, (xfs_dir2_block_t *)d);
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+ }
+ /*
+ * If this isn't the start of the block, then back up to
+ * the previous entry and see if it's free.
+ */
+ if (offset > sizeof(d->hdr)) {
+ __be16 *tagp; /* tag just before us */
+
+ tagp = (__be16 *)((char *)d + offset) - 1;
+ prevdup = (xfs_dir2_data_unused_t *)((char *)d + be16_to_cpu(*tagp));
+ if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ prevdup = NULL;
+ } else
+ prevdup = NULL;
+ /*
+ * If this isn't the end of the block, see if the entry after
+ * us is free.
+ */
+ if ((char *)d + offset + len < endptr) {
+ postdup =
+ (xfs_dir2_data_unused_t *)((char *)d + offset + len);
+ if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+ postdup = NULL;
+ } else
+ postdup = NULL;
+ ASSERT(*needscanp == 0);
+ needscan = 0;
+ /*
+ * Previous and following entries are both free,
+ * merge everything into a single free entry.
+ */
+ if (prevdup && postdup) {
+ xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
+
+ /*
+ * See if prevdup and/or postdup are in bestfree table.
+ */
+ dfp = xfs_dir2_data_freefind(d, prevdup);
+ dfp2 = xfs_dir2_data_freefind(d, postdup);
+ /*
+ * We need a rescan unless there are exactly 2 free entries
+ * namely our two. Then we know what's happening, otherwise
+ * since the third bestfree is there, there might be more
+ * entries.
+ */
+ needscan = (d->hdr.bestfree[2].length != 0);
+ /*
+ * Fix up the new big freespace.
+ */
+ be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, prevdup);
+ if (!needscan) {
+ /*
+ * Has to be the case that entries 0 and 1 are
+ * dfp and dfp2 (don't know which is which), and
+ * entry 2 is empty.
+ * Remove entry 1 first then entry 0.
+ */
+ ASSERT(dfp && dfp2);
+ if (dfp == &d->hdr.bestfree[1]) {
+ dfp = &d->hdr.bestfree[0];
+ ASSERT(dfp2 == dfp);
+ dfp2 = &d->hdr.bestfree[1];
+ }
+ xfs_dir2_data_freeremove(d, dfp2, needlogp);
+ xfs_dir2_data_freeremove(d, dfp, needlogp);
+ /*
+ * Now insert the new entry.
+ */
+ dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp);
+ ASSERT(dfp == &d->hdr.bestfree[0]);
+ ASSERT(dfp->length == prevdup->length);
+ ASSERT(!dfp[1].length);
+ ASSERT(!dfp[2].length);
+ }
+ }
+ /*
+ * The entry before us is free, merge with it.
+ */
+ else if (prevdup) {
+ dfp = xfs_dir2_data_freefind(d, prevdup);
+ be16_add_cpu(&prevdup->length, len);
+ *xfs_dir2_data_unused_tag_p(prevdup) =
+ cpu_to_be16((char *)prevdup - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, prevdup);
+ /*
+ * If the previous entry was in the table, the new entry
+ * is longer, so it will be in the table too. Remove
+ * the old one and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(d, dfp, needlogp);
+ (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp);
+ }
+ /*
+ * Otherwise we need a scan if the new entry is big enough.
+ */
+ else {
+ needscan = be16_to_cpu(prevdup->length) >
+ be16_to_cpu(d->hdr.bestfree[2].length);
+ }
+ }
+ /*
+ * The following entry is free, merge with it.
+ */
+ else if (postdup) {
+ dfp = xfs_dir2_data_freefind(d, postdup);
+ newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, newdup);
+ /*
+ * If the following entry was in the table, the new entry
+ * is longer, so it will be in the table too. Remove
+ * the old one and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(d, dfp, needlogp);
+ (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ }
+ /*
+ * Otherwise we need a scan if the new entry is big enough.
+ */
+ else {
+ needscan = be16_to_cpu(newdup->length) >
+ be16_to_cpu(d->hdr.bestfree[2].length);
+ }
+ }
+ /*
+ * Neither neighbor is free. Make a new entry.
+ */
+ else {
+ newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, newdup);
+ (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ }
+ *needscanp = needscan;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+void
+xfs_dir2_data_use_free(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp, /* data block buffer */
+ xfs_dir2_data_unused_t *dup, /* unused entry */
+ xfs_dir2_data_aoff_t offset, /* starting offset to use */
+ xfs_dir2_data_aoff_t len, /* length to use */
+ int *needlogp, /* out: need to log header */
+ int *needscanp) /* out: need regen bestfree */
+{
+ xfs_dir2_data_t *d; /* data block */
+ xfs_dir2_data_free_t *dfp; /* bestfree pointer */
+ int matchback; /* matches end of freespace */
+ int matchfront; /* matches start of freespace */
+ int needscan; /* need to regen bestfree */
+ xfs_dir2_data_unused_t *newdup; /* new unused entry */
+ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
+ int oldlen; /* old unused entry's length */
+
+ d = bp->data;
+ ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC ||
+ be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC);
+ ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
+ ASSERT(offset >= (char *)dup - (char *)d);
+ ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d);
+ ASSERT((char *)dup - (char *)d == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+ /*
+ * Look up the entry in the bestfree table.
+ */
+ dfp = xfs_dir2_data_freefind(d, dup);
+ oldlen = be16_to_cpu(dup->length);
+ ASSERT(dfp || oldlen <= be16_to_cpu(d->hdr.bestfree[2].length));
+ /*
+ * Check for alignment with front and back of the entry.
+ */
+ matchfront = (char *)dup - (char *)d == offset;
+ matchback = (char *)dup + oldlen - (char *)d == offset + len;
+ ASSERT(*needscanp == 0);
+ needscan = 0;
+ /*
+ * If we matched it exactly we just need to get rid of it from
+ * the bestfree table.
+ */
+ if (matchfront && matchback) {
+ if (dfp) {
+ needscan = (d->hdr.bestfree[2].offset != 0);
+ if (!needscan)
+ xfs_dir2_data_freeremove(d, dfp, needlogp);
+ }
+ }
+ /*
+ * We match the first part of the entry.
+ * Make a new entry with the remaining freespace.
+ */
+ else if (matchfront) {
+ newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
+ newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup->length = cpu_to_be16(oldlen - len);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, newdup);
+ /*
+ * If it was in the table, remove it and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(d, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ ASSERT(dfp != NULL);
+ ASSERT(dfp->length == newdup->length);
+ ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d);
+ /*
+ * If we got inserted at the last slot,
+ * that means we don't know if there was a better
+ * choice for the last slot, or not. Rescan.
+ */
+ needscan = dfp == &d->hdr.bestfree[2];
+ }
+ }
+ /*
+ * We match the last part of the entry.
+ * Trim the allocated space off the tail of the entry.
+ */
+ else if (matchback) {
+ newdup = dup;
+ newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, newdup);
+ /*
+ * If it was in the table, remove it and add the new one.
+ */
+ if (dfp) {
+ xfs_dir2_data_freeremove(d, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+ ASSERT(dfp != NULL);
+ ASSERT(dfp->length == newdup->length);
+ ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d);
+ /*
+ * If we got inserted at the last slot,
+ * that means we don't know if there was a better
+ * choice for the last slot, or not. Rescan.
+ */
+ needscan = dfp == &d->hdr.bestfree[2];
+ }
+ }
+ /*
+ * Poking out the middle of an entry.
+ * Make two new entries.
+ */
+ else {
+ newdup = dup;
+ newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup);
+ *xfs_dir2_data_unused_tag_p(newdup) =
+ cpu_to_be16((char *)newdup - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, newdup);
+ newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
+ newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+ newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+ *xfs_dir2_data_unused_tag_p(newdup2) =
+ cpu_to_be16((char *)newdup2 - (char *)d);
+ xfs_dir2_data_log_unused(tp, bp, newdup2);
+ /*
+ * If the old entry was in the table, we need to scan
+ * if the 3rd entry was valid, since these entries
+ * are smaller than the old one.
+ * If we don't need to scan that means there were 1 or 2
+ * entries in the table, and removing the old and adding
+ * the 2 new will work.
+ */
+ if (dfp) {
+ needscan = (d->hdr.bestfree[2].length != 0);
+ if (!needscan) {
+ xfs_dir2_data_freeremove(d, dfp, needlogp);
+ (void)xfs_dir2_data_freeinsert(d, newdup,
+ needlogp);
+ (void)xfs_dir2_data_freeinsert(d, newdup2,
+ needlogp);
+ }
+ }
+ }
+ *needscanp = needscan;
+}
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
new file mode 100644
index 00000000..efbc290c
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DIR2_DATA_H__
+#define __XFS_DIR2_DATA_H__
+
+/*
+ * Directory format 2, data block structures.
+ */
+
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Constants.
+ */
+#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */
+#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG 0xffff
+#define XFS_DIR2_DATA_FD_COUNT 3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32GB.
+ */
+#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE 0
+#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_DATA_FIRSTDB(mp) \
+ xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
+
+/*
+ * Offsets of . and .. in data space (always block 0)
+ */
+#define XFS_DIR2_DATA_DOT_OFFSET \
+ ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t))
+#define XFS_DIR2_DATA_DOTDOT_OFFSET \
+ (XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1))
+#define XFS_DIR2_DATA_FIRST_OFFSET \
+ (XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2))
+
+/*
+ * Structures.
+ */
+
+/*
+ * Describe a free area in the data block.
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+ __be16 offset; /* start of freespace */
+ __be16 length; /* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ * Always at the beginning of a directory-sized block.
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+ __be32 magic; /* XFS_DIR2_DATA_MAGIC */
+ /* or XFS_DIR2_BLOCK_MAGIC */
+ xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * Active entry in a data block. Aligned to 8 bytes.
+ * Tag appears as the last 2 bytes.
+ */
+typedef struct xfs_dir2_data_entry {
+ __be64 inumber; /* inode number */
+ __u8 namelen; /* name length */
+ __u8 name[1]; /* name bytes, no null */
+ /* variable offset */
+ __be16 tag; /* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block. Aligned to 8 bytes.
+ * Tag appears as the last 2 bytes.
+ */
+typedef struct xfs_dir2_data_unused {
+ __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */
+ __be16 length; /* total free length */
+ /* variable offset */
+ __be16 tag; /* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+typedef union {
+ xfs_dir2_data_entry_t entry;
+ xfs_dir2_data_unused_t unused;
+} xfs_dir2_data_union_t;
+
+/*
+ * Generic data block structure, for xfs_db.
+ */
+typedef struct xfs_dir2_data {
+ xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */
+ xfs_dir2_data_union_t u[1];
+} xfs_dir2_data_t;
+
+/*
+ * Macros.
+ */
+
+/*
+ * Size of a data entry.
+ */
+static inline int xfs_dir2_data_entsize(int n)
+{
+ return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \
+ (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN);
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+static inline __be16 *
+xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup)
+{
+ return (__be16 *)((char *)dup +
+ be16_to_cpu(dup->length) - sizeof(__be16));
+}
+
+/*
+ * Function declarations.
+ */
+#ifdef DEBUG
+extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp);
+#else
+#define xfs_dir2_data_check(dp,bp)
+#endif
+extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d,
+ xfs_dir2_data_unused_t *dup);
+extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d,
+ xfs_dir2_data_unused_t *dup, int *loghead);
+extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d,
+ int *loghead);
+extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+ struct xfs_dabuf **bpp);
+extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ xfs_dir2_data_entry_t *dep);
+extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
+ struct xfs_dabuf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ xfs_dir2_data_unused_t *dup);
+extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len, int *needlogp,
+ int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ xfs_dir2_data_unused_t *dup,
+ xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len, int *needlogp,
+ int *needscanp);
+
+#endif /* __XFS_DIR2_DATA_H__ */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
new file mode 100644
index 00000000..ae891223
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -0,0 +1,1881 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_dir2_node.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+/*
+ * Local function declarations.
+ */
+#ifdef DEBUG
+static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+#else
+#define xfs_dir2_leaf_check(dp, bp)
+#endif
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp,
+ int *indexp, xfs_dabuf_t **dbpp);
+static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ int first, int last);
+static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
+
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int /* error */
+xfs_dir2_block_to_leaf(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *dbp) /* input block's buffer */
+{
+ __be16 *bestsp; /* leaf's bestsp entries */
+ xfs_dablk_t blkno; /* leaf block's bno */
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */
+ xfs_dir2_block_tail_t *btp; /* block's tail */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dabuf_t *lbp; /* leaf block's buffer */
+ xfs_dir2_db_t ldb; /* leaf block's bno */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needlog; /* need to log block header */
+ int needscan; /* need to rescan bestfree */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_block_to_leaf(args);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ /*
+ * Add the leaf block to the inode.
+ * This interface will only put blocks in the leaf/node range.
+ * Since that's empty now, we'll get the root (block 0 in range).
+ */
+ if ((error = xfs_da_grow_inode(args, &blkno))) {
+ return error;
+ }
+ ldb = xfs_dir2_da_to_db(mp, blkno);
+ ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
+ /*
+ * Initialize the leaf block, get a buffer for it.
+ */
+ if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
+ return error;
+ }
+ ASSERT(lbp != NULL);
+ leaf = lbp->data;
+ block = dbp->data;
+ xfs_dir2_data_check(dp, dbp);
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+ /*
+ * Set the counts in the leaf header.
+ */
+ leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count));
+ leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale));
+ /*
+ * Could compact these but I think we always do the conversion
+ * after squeezing out stale entries.
+ */
+ memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1);
+ needscan = 0;
+ needlog = 1;
+ /*
+ * Make the space formerly occupied by the leaf entries and block
+ * tail be free.
+ */
+ xfs_dir2_data_make_free(tp, dbp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
+ (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize -
+ (char *)blp),
+ &needlog, &needscan);
+ /*
+ * Fix up the block header, make it a data block.
+ */
+ block->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ if (needscan)
+ xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog);
+ /*
+ * Set up leaf tail and bests table.
+ */
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp->bestcount = cpu_to_be32(1);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ bestsp[0] = block->hdr.bestfree[0].length;
+ /*
+ * Log the data header and leaf bests table.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_leaf_check(dp, lbp);
+ xfs_dir2_data_check(dp, dbp);
+ xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
+ xfs_da_buf_done(lbp);
+ return 0;
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int /* error */
+xfs_dir2_leaf_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ __be16 *bestsp; /* freespace table in leaf */
+ int compact; /* need to compact leaves */
+ xfs_dir2_data_t *data; /* data block structure */
+ xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_data_unused_t *dup; /* data unused entry */
+ int error; /* error return value */
+ int grown; /* allocated new data block */
+ int highstale; /* index of next stale leaf */
+ int i; /* temporary, index */
+ int index; /* leaf table position */
+ xfs_dabuf_t *lbp; /* leaf's buffer */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int length; /* length of new entry */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
+ int lfloglow; /* low leaf logging index */
+ int lfloghigh; /* high leaf logging index */
+ int lowstale; /* index of prev stale leaf */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needbytes; /* leaf block bytes needed */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data free */
+ __be16 *tagp; /* end of data entry */
+ xfs_trans_t *tp; /* transaction pointer */
+ xfs_dir2_db_t use_block; /* data block number */
+
+ trace_xfs_dir2_leaf_addname(args);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ /*
+ * Read the leaf block.
+ */
+ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+ XFS_DATA_FORK);
+ if (error) {
+ return error;
+ }
+ ASSERT(lbp != NULL);
+ /*
+ * Look up the entry by hash value and name.
+ * We know it's not there, our caller has already done a lookup.
+ * So the index is of the entry to insert in front of.
+ * But if there are dup hash values the index is of the first of those.
+ */
+ index = xfs_dir2_leaf_search_hash(args, lbp);
+ leaf = lbp->data;
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ length = xfs_dir2_data_entsize(args->namelen);
+ /*
+ * See if there are any entries with the same hash value
+ * and space in their block for the new entry.
+ * This is good because it puts multiple same-hash value entries
+ * in a data block, improving the lookup of those entries.
+ */
+ for (use_block = -1, lep = &leaf->ents[index];
+ index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
+ index++, lep++) {
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ ASSERT(i < be32_to_cpu(ltp->bestcount));
+ ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF);
+ if (be16_to_cpu(bestsp[i]) >= length) {
+ use_block = i;
+ break;
+ }
+ }
+ /*
+ * Didn't find a block yet, linear search all the data blocks.
+ */
+ if (use_block == -1) {
+ for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) {
+ /*
+ * Remember a block we see that's missing.
+ */
+ if (be16_to_cpu(bestsp[i]) == NULLDATAOFF && use_block == -1)
+ use_block = i;
+ else if (be16_to_cpu(bestsp[i]) >= length) {
+ use_block = i;
+ break;
+ }
+ }
+ }
+ /*
+ * How many bytes do we need in the leaf block?
+ */
+ needbytes =
+ (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) +
+ (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0]));
+ /*
+ * Now kill use_block if it refers to a missing block, so we
+ * can use it as an indication of allocation needed.
+ */
+ if (use_block != -1 && be16_to_cpu(bestsp[use_block]) == NULLDATAOFF)
+ use_block = -1;
+ /*
+ * If we don't have enough free bytes but we can make enough
+ * by compacting out stale entries, we'll do that.
+ */
+ if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
+ needbytes && be16_to_cpu(leaf->hdr.stale) > 1) {
+ compact = 1;
+ }
+ /*
+ * Otherwise if we don't have enough free bytes we need to
+ * convert to node form.
+ */
+ else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(
+ leaf->hdr.count)] < needbytes) {
+ /*
+ * Just checking or no space reservation, give up.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+ args->total == 0) {
+ xfs_da_brelse(tp, lbp);
+ return XFS_ERROR(ENOSPC);
+ }
+ /*
+ * Convert to node form.
+ */
+ error = xfs_dir2_leaf_to_node(args, lbp);
+ xfs_da_buf_done(lbp);
+ if (error)
+ return error;
+ /*
+ * Then add the new entry.
+ */
+ return xfs_dir2_node_addname(args);
+ }
+ /*
+ * Otherwise it will fit without compaction.
+ */
+ else
+ compact = 0;
+ /*
+ * If just checking, then it will fit unless we needed to allocate
+ * a new data block.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ xfs_da_brelse(tp, lbp);
+ return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
+ }
+ /*
+ * If no allocations are allowed, return now before we've
+ * changed anything.
+ */
+ if (args->total == 0 && use_block == -1) {
+ xfs_da_brelse(tp, lbp);
+ return XFS_ERROR(ENOSPC);
+ }
+ /*
+ * Need to compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and we'll shift that one to our insertion
+ * point later.
+ */
+ if (compact) {
+ xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
+ &lfloglow, &lfloghigh);
+ }
+ /*
+ * There are stale entries, so we'll need log-low and log-high
+ * impossibly bad values later.
+ */
+ else if (be16_to_cpu(leaf->hdr.stale)) {
+ lfloglow = be16_to_cpu(leaf->hdr.count);
+ lfloghigh = -1;
+ }
+ /*
+ * If there was no data block space found, we need to allocate
+ * a new one.
+ */
+ if (use_block == -1) {
+ /*
+ * Add the new data block.
+ */
+ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+ &use_block))) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * Initialize the block.
+ */
+ if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ /*
+ * If we're adding a new data block on the end we need to
+ * extend the bests table. Copy it up one entry.
+ */
+ if (use_block >= be32_to_cpu(ltp->bestcount)) {
+ bestsp--;
+ memmove(&bestsp[0], &bestsp[1],
+ be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
+ be32_add_cpu(&ltp->bestcount, 1);
+ xfs_dir2_leaf_log_tail(tp, lbp);
+ xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ }
+ /*
+ * If we're filling in a previously empty block just log it.
+ */
+ else
+ xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ data = dbp->data;
+ bestsp[use_block] = data->hdr.bestfree[0].length;
+ grown = 1;
+ }
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ else {
+ if ((error =
+ xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
+ -1, &dbp, XFS_DATA_FORK))) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ data = dbp->data;
+ grown = 0;
+ }
+ xfs_dir2_data_check(dp, dbp);
+ /*
+ * Point to the biggest freespace in our data block.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset));
+ ASSERT(be16_to_cpu(dup->length) >= length);
+ needscan = needlog = 0;
+ /*
+ * Mark the initial part of our freespace in use for the new entry.
+ */
+ xfs_dir2_data_use_free(tp, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+ &needlog, &needscan);
+ /*
+ * Initialize our new entry (at last).
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)data);
+ /*
+ * Need to scan fix up the bestfree table.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, data, &needlog);
+ /*
+ * Need to log the data block's header.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_entry(tp, dbp, dep);
+ /*
+ * If the bests table needs to be changed, do it.
+ * Log the change unless we've already done that.
+ */
+ if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(data->hdr.bestfree[0].length)) {
+ bestsp[use_block] = data->hdr.bestfree[0].length;
+ if (!grown)
+ xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+ }
+ /*
+ * Now we need to make room to insert the leaf entry.
+ * If there are no stale entries, we just insert a hole at index.
+ */
+ if (!leaf->hdr.stale) {
+ /*
+ * lep is still good as the index leaf entry.
+ */
+ if (index < be16_to_cpu(leaf->hdr.count))
+ memmove(lep + 1, lep,
+ (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep));
+ /*
+ * Record low and high logging indices for the leaf.
+ */
+ lfloglow = index;
+ lfloghigh = be16_to_cpu(leaf->hdr.count);
+ be16_add_cpu(&leaf->hdr.count, 1);
+ }
+ /*
+ * There are stale entries.
+ * We will use one of them for the new entry.
+ * It's probably not at the right location, so we'll have to
+ * shift some up or down first.
+ */
+ else {
+ /*
+ * If we didn't compact before, we need to find the nearest
+ * stale entries before and after our insertion point.
+ */
+ if (compact == 0) {
+ /*
+ * Find the first stale entry before the insertion
+ * point, if any.
+ */
+ for (lowstale = index - 1;
+ lowstale >= 0 &&
+ be32_to_cpu(leaf->ents[lowstale].address) !=
+ XFS_DIR2_NULL_DATAPTR;
+ lowstale--)
+ continue;
+ /*
+ * Find the next stale entry at or after the insertion
+ * point, if any. Stop if we go so far that the
+ * lowstale entry would be better.
+ */
+ for (highstale = index;
+ highstale < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(leaf->ents[highstale].address) !=
+ XFS_DIR2_NULL_DATAPTR &&
+ (lowstale < 0 ||
+ index - lowstale - 1 >= highstale - index);
+ highstale++)
+ continue;
+ }
+ /*
+ * If the low one is better, use it.
+ */
+ if (lowstale >= 0 &&
+ (highstale == be16_to_cpu(leaf->hdr.count) ||
+ index - lowstale - 1 < highstale - index)) {
+ ASSERT(index - lowstale - 1 >= 0);
+ ASSERT(be32_to_cpu(leaf->ents[lowstale].address) ==
+ XFS_DIR2_NULL_DATAPTR);
+ /*
+ * Copy entries up to cover the stale entry
+ * and make room for the new entry.
+ */
+ if (index - lowstale - 1 > 0)
+ memmove(&leaf->ents[lowstale],
+ &leaf->ents[lowstale + 1],
+ (index - lowstale - 1) * sizeof(*lep));
+ lep = &leaf->ents[index - 1];
+ lfloglow = MIN(lowstale, lfloglow);
+ lfloghigh = MAX(index - 1, lfloghigh);
+ }
+ /*
+ * The high one is better, so use that one.
+ */
+ else {
+ ASSERT(highstale - index >= 0);
+ ASSERT(be32_to_cpu(leaf->ents[highstale].address) ==
+ XFS_DIR2_NULL_DATAPTR);
+ /*
+ * Copy entries down to cover the stale entry
+ * and make room for the new entry.
+ */
+ if (highstale - index > 0)
+ memmove(&leaf->ents[index + 1],
+ &leaf->ents[index],
+ (highstale - index) * sizeof(*lep));
+ lep = &leaf->ents[index];
+ lfloglow = MIN(index, lfloglow);
+ lfloghigh = MAX(highstale, lfloghigh);
+ }
+ be16_add_cpu(&leaf->hdr.stale, -1);
+ }
+ /*
+ * Fill in the new leaf entry.
+ */
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block,
+ be16_to_cpu(*tagp)));
+ /*
+ * Log the leaf fields and give up the buffers.
+ */
+ xfs_dir2_leaf_log_header(tp, lbp);
+ xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
+ xfs_dir2_leaf_check(dp, lbp);
+ xfs_da_buf_done(lbp);
+ xfs_dir2_data_check(dp, dbp);
+ xfs_da_buf_done(dbp);
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+STATIC void
+xfs_dir2_leaf_check(
+ xfs_inode_t *dp, /* incore directory inode */
+ xfs_dabuf_t *bp) /* leaf's buffer */
+{
+ int i; /* leaf index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int stale; /* count of stale leaves */
+
+ leaf = bp->data;
+ mp = dp->i_mount;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC);
+ /*
+ * This value is not restrictive enough.
+ * Should factor in the size of the bests table as well.
+ * We can deduce a value for that from di_size.
+ */
+ ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ /*
+ * Leaves and bests don't overlap.
+ */
+ ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <=
+ (char *)xfs_dir2_leaf_bests_p(ltp));
+ /*
+ * Check hash value order, count stale entries.
+ */
+ for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
+ if (i + 1 < be16_to_cpu(leaf->hdr.count))
+ ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
+ be32_to_cpu(leaf->ents[i + 1].hashval));
+ if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR)
+ stale++;
+ }
+ ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
+}
+#endif /* DEBUG */
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
+ */
+void
+xfs_dir2_leaf_compact(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *bp) /* leaf buffer */
+{
+ int from; /* source leaf index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int loglow; /* first leaf entry to log */
+ int to; /* target leaf index */
+
+ leaf = bp->data;
+ if (!leaf->hdr.stale) {
+ return;
+ }
+ /*
+ * Compress out the stale entries in place.
+ */
+ for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) {
+ if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Only actually copy the entries that are different.
+ */
+ if (from > to) {
+ if (loglow == -1)
+ loglow = to;
+ leaf->ents[to] = leaf->ents[from];
+ }
+ to++;
+ }
+ /*
+ * Update and log the header, log the leaf entries.
+ */
+ ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to);
+ be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale)));
+ leaf->hdr.stale = 0;
+ xfs_dir2_leaf_log_header(args->trans, bp);
+ if (loglow != -1)
+ xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ */
+void
+xfs_dir2_leaf_compact_x1(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ int *indexp, /* insertion index */
+ int *lowstalep, /* out: stale entry before us */
+ int *highstalep, /* out: stale entry after us */
+ int *lowlogp, /* out: low log index */
+ int *highlogp) /* out: high log index */
+{
+ int from; /* source copy index */
+ int highstale; /* stale entry at/after index */
+ int index; /* insertion index */
+ int keepstale; /* source index of kept stale */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int lowstale; /* stale entry before index */
+ int newindex=0; /* new insertion index */
+ int to; /* destination copy index */
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.stale) > 1);
+ index = *indexp;
+ /*
+ * Find the first stale entry before our index, if any.
+ */
+ for (lowstale = index - 1;
+ lowstale >= 0 &&
+ be32_to_cpu(leaf->ents[lowstale].address) != XFS_DIR2_NULL_DATAPTR;
+ lowstale--)
+ continue;
+ /*
+ * Find the first stale entry at or after our index, if any.
+ * Stop if the answer would be worse than lowstale.
+ */
+ for (highstale = index;
+ highstale < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(leaf->ents[highstale].address) != XFS_DIR2_NULL_DATAPTR &&
+ (lowstale < 0 || index - lowstale > highstale - index);
+ highstale++)
+ continue;
+ /*
+ * Pick the better of lowstale and highstale.
+ */
+ if (lowstale >= 0 &&
+ (highstale == be16_to_cpu(leaf->hdr.count) ||
+ index - lowstale <= highstale - index))
+ keepstale = lowstale;
+ else
+ keepstale = highstale;
+ /*
+ * Copy the entries in place, removing all the stale entries
+ * except keepstale.
+ */
+ for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
+ /*
+ * Notice the new value of index.
+ */
+ if (index == from)
+ newindex = to;
+ if (from != keepstale &&
+ be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) {
+ if (from == to)
+ *lowlogp = to;
+ continue;
+ }
+ /*
+ * Record the new keepstale value for the insertion.
+ */
+ if (from == keepstale)
+ lowstale = highstale = to;
+ /*
+ * Copy only the entries that have moved.
+ */
+ if (from > to)
+ leaf->ents[to] = leaf->ents[from];
+ to++;
+ }
+ ASSERT(from > to);
+ /*
+ * If the insertion point was past the last entry,
+ * set the new insertion point accordingly.
+ */
+ if (index == from)
+ newindex = to;
+ *indexp = newindex;
+ /*
+ * Adjust the leaf header values.
+ */
+ be16_add_cpu(&leaf->hdr.count, -(from - to));
+ leaf->hdr.stale = cpu_to_be16(1);
+ /*
+ * Remember the low/high stale value only in the "right"
+ * direction.
+ */
+ if (lowstale >= newindex)
+ lowstale = -1;
+ else
+ highstale = be16_to_cpu(leaf->hdr.count);
+ *highlogp = be16_to_cpu(leaf->hdr.count) - 1;
+ *lowstalep = lowstale;
+ *highstalep = highstale;
+}
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+int /* error */
+xfs_dir2_leaf_getdents(
+ xfs_inode_t *dp, /* incore directory inode */
+ void *dirent,
+ size_t bufsize,
+ xfs_off_t *offset,
+ filldir_t filldir)
+{
+ xfs_dabuf_t *bp; /* data block buffer */
+ int byteoff; /* offset in current block */
+ xfs_dir2_db_t curdb; /* db for current block */
+ xfs_dir2_off_t curoff; /* current overall offset */
+ xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_dir2_data_unused_t *dup; /* unused entry */
+ int error = 0; /* error return value */
+ int i; /* temporary loop index */
+ int j; /* temporary loop index */
+ int length; /* temporary length value */
+ xfs_bmbt_irec_t *map; /* map vector for blocks */
+ xfs_extlen_t map_blocks; /* number of fsbs in map */
+ xfs_dablk_t map_off; /* last mapped file offset */
+ int map_size; /* total entries in *map */
+ int map_valid; /* valid entries in *map */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_off_t newoff; /* new curoff after new blk */
+ int nmap; /* mappings to ask xfs_bmapi */
+ char *ptr = NULL; /* pointer to current data */
+ int ra_current; /* number of read-ahead blks */
+ int ra_index; /* *map index for read-ahead */
+ int ra_offset; /* map entry offset for ra */
+ int ra_want; /* readahead count wanted */
+
+ /*
+ * If the offset is at or past the largest allowed value,
+ * give up right away.
+ */
+ if (*offset >= XFS_DIR2_MAX_DATAPTR)
+ return 0;
+
+ mp = dp->i_mount;
+
+ /*
+ * Set up to bmap a number of blocks based on the caller's
+ * buffer size, the directory block size, and the filesystem
+ * block size.
+ */
+ map_size = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize);
+ map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
+ map_valid = ra_index = ra_offset = ra_current = map_blocks = 0;
+ bp = NULL;
+
+ /*
+ * Inside the loop we keep the main offset value as a byte offset
+ * in the directory file.
+ */
+ curoff = xfs_dir2_dataptr_to_byte(mp, *offset);
+
+ /*
+ * Force this conversion through db so we truncate the offset
+ * down to get the start of the data block.
+ */
+ map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff));
+ /*
+ * Loop over directory entries until we reach the end offset.
+ * Get more blocks and readahead as necessary.
+ */
+ while (curoff < XFS_DIR2_LEAF_OFFSET) {
+ /*
+ * If we have no buffer, or we're off the end of the
+ * current buffer, need to get another one.
+ */
+ if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) {
+ /*
+ * If we have a buffer, we need to release it and
+ * take it out of the mapping.
+ */
+ if (bp) {
+ xfs_da_brelse(NULL, bp);
+ bp = NULL;
+ map_blocks -= mp->m_dirblkfsbs;
+ /*
+ * Loop to get rid of the extents for the
+ * directory block.
+ */
+ for (i = mp->m_dirblkfsbs; i > 0; ) {
+ j = MIN((int)map->br_blockcount, i);
+ map->br_blockcount -= j;
+ map->br_startblock += j;
+ map->br_startoff += j;
+ /*
+ * If mapping is done, pitch it from
+ * the table.
+ */
+ if (!map->br_blockcount && --map_valid)
+ memmove(&map[0], &map[1],
+ sizeof(map[0]) *
+ map_valid);
+ i -= j;
+ }
+ }
+ /*
+ * Recalculate the readahead blocks wanted.
+ */
+ ra_want = howmany(bufsize + mp->m_dirblksize,
+ mp->m_sb.sb_blocksize) - 1;
+ ASSERT(ra_want >= 0);
+
+ /*
+ * If we don't have as many as we want, and we haven't
+ * run out of data blocks, get some more mappings.
+ */
+ if (1 + ra_want > map_blocks &&
+ map_off <
+ xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
+ /*
+ * Get more bmaps, fill in after the ones
+ * we already have in the table.
+ */
+ nmap = map_size - map_valid;
+ error = xfs_bmapi(NULL, dp,
+ map_off,
+ xfs_dir2_byte_to_da(mp,
+ XFS_DIR2_LEAF_OFFSET) - map_off,
+ XFS_BMAPI_METADATA, NULL, 0,
+ &map[map_valid], &nmap, NULL);
+ /*
+ * Don't know if we should ignore this or
+ * try to return an error.
+ * The trouble with returning errors
+ * is that readdir will just stop without
+ * actually passing the error through.
+ */
+ if (error)
+ break; /* XXX */
+ /*
+ * If we got all the mappings we asked for,
+ * set the final map offset based on the
+ * last bmap value received.
+ * Otherwise, we've reached the end.
+ */
+ if (nmap == map_size - map_valid)
+ map_off =
+ map[map_valid + nmap - 1].br_startoff +
+ map[map_valid + nmap - 1].br_blockcount;
+ else
+ map_off =
+ xfs_dir2_byte_to_da(mp,
+ XFS_DIR2_LEAF_OFFSET);
+ /*
+ * Look for holes in the mapping, and
+ * eliminate them. Count up the valid blocks.
+ */
+ for (i = map_valid; i < map_valid + nmap; ) {
+ if (map[i].br_startblock ==
+ HOLESTARTBLOCK) {
+ nmap--;
+ length = map_valid + nmap - i;
+ if (length)
+ memmove(&map[i],
+ &map[i + 1],
+ sizeof(map[i]) *
+ length);
+ } else {
+ map_blocks +=
+ map[i].br_blockcount;
+ i++;
+ }
+ }
+ map_valid += nmap;
+ }
+ /*
+ * No valid mappings, so no more data blocks.
+ */
+ if (!map_valid) {
+ curoff = xfs_dir2_da_to_byte(mp, map_off);
+ break;
+ }
+ /*
+ * Read the directory block starting at the first
+ * mapping.
+ */
+ curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
+ error = xfs_da_read_buf(NULL, dp, map->br_startoff,
+ map->br_blockcount >= mp->m_dirblkfsbs ?
+ XFS_FSB_TO_DADDR(mp, map->br_startblock) :
+ -1,
+ &bp, XFS_DATA_FORK);
+ /*
+ * Should just skip over the data block instead
+ * of giving up.
+ */
+ if (error)
+ break; /* XXX */
+ /*
+ * Adjust the current amount of read-ahead: we just
+ * read a block that was previously ra.
+ */
+ if (ra_current)
+ ra_current -= mp->m_dirblkfsbs;
+ /*
+ * Do we need more readahead?
+ */
+ for (ra_index = ra_offset = i = 0;
+ ra_want > ra_current && i < map_blocks;
+ i += mp->m_dirblkfsbs) {
+ ASSERT(ra_index < map_valid);
+ /*
+ * Read-ahead a contiguous directory block.
+ */
+ if (i > ra_current &&
+ map[ra_index].br_blockcount >=
+ mp->m_dirblkfsbs) {
+ xfs_buf_readahead(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp,
+ map[ra_index].br_startblock +
+ ra_offset),
+ (int)BTOBB(mp->m_dirblksize));
+ ra_current = i;
+ }
+ /*
+ * Read-ahead a non-contiguous directory block.
+ * This doesn't use our mapping, but this
+ * is a very rare case.
+ */
+ else if (i > ra_current) {
+ (void)xfs_da_reada_buf(NULL, dp,
+ map[ra_index].br_startoff +
+ ra_offset, XFS_DATA_FORK);
+ ra_current = i;
+ }
+ /*
+ * Advance offset through the mapping table.
+ */
+ for (j = 0; j < mp->m_dirblkfsbs; j++) {
+ /*
+ * The rest of this extent but not
+ * more than a dir block.
+ */
+ length = MIN(mp->m_dirblkfsbs,
+ (int)(map[ra_index].br_blockcount -
+ ra_offset));
+ j += length;
+ ra_offset += length;
+ /*
+ * Advance to the next mapping if
+ * this one is used up.
+ */
+ if (ra_offset ==
+ map[ra_index].br_blockcount) {
+ ra_offset = 0;
+ ra_index++;
+ }
+ }
+ }
+ /*
+ * Having done a read, we need to set a new offset.
+ */
+ newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0);
+ /*
+ * Start of the current block.
+ */
+ if (curoff < newoff)
+ curoff = newoff;
+ /*
+ * Make sure we're in the right block.
+ */
+ else if (curoff > newoff)
+ ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
+ curdb);
+ data = bp->data;
+ xfs_dir2_data_check(dp, bp);
+ /*
+ * Find our position in the block.
+ */
+ ptr = (char *)&data->u;
+ byteoff = xfs_dir2_byte_to_off(mp, curoff);
+ /*
+ * Skip past the header.
+ */
+ if (byteoff == 0)
+ curoff += (uint)sizeof(data->hdr);
+ /*
+ * Skip past entries until we reach our offset.
+ */
+ else {
+ while ((char *)ptr - (char *)data < byteoff) {
+ dup = (xfs_dir2_data_unused_t *)ptr;
+
+ if (be16_to_cpu(dup->freetag)
+ == XFS_DIR2_DATA_FREE_TAG) {
+
+ length = be16_to_cpu(dup->length);
+ ptr += length;
+ continue;
+ }
+ dep = (xfs_dir2_data_entry_t *)ptr;
+ length =
+ xfs_dir2_data_entsize(dep->namelen);
+ ptr += length;
+ }
+ /*
+ * Now set our real offset.
+ */
+ curoff =
+ xfs_dir2_db_off_to_byte(mp,
+ xfs_dir2_byte_to_db(mp, curoff),
+ (char *)ptr - (char *)data);
+ if (ptr >= (char *)data + mp->m_dirblksize) {
+ continue;
+ }
+ }
+ }
+ /*
+ * We have a pointer to an entry.
+ * Is it a live one?
+ */
+ dup = (xfs_dir2_data_unused_t *)ptr;
+ /*
+ * No, it's unused, skip over it.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ length = be16_to_cpu(dup->length);
+ ptr += length;
+ curoff += length;
+ continue;
+ }
+
+ dep = (xfs_dir2_data_entry_t *)ptr;
+ length = xfs_dir2_data_entsize(dep->namelen);
+
+ if (filldir(dirent, (char *)dep->name, dep->namelen,
+ xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
+ be64_to_cpu(dep->inumber), DT_UNKNOWN))
+ break;
+
+ /*
+ * Advance to next entry in the block.
+ */
+ ptr += length;
+ curoff += length;
+ /* bufsize may have just been a guess; don't go negative */
+ bufsize = bufsize > length ? bufsize - length : 0;
+ }
+
+ /*
+ * All done. Set output offset value to current offset.
+ */
+ if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
+ *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+ else
+ *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ kmem_free(map);
+ if (bp)
+ xfs_da_brelse(NULL, bp);
+ return error;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+int
+xfs_dir2_leaf_init(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dir2_db_t bno, /* directory block number */
+ xfs_dabuf_t **bpp, /* out: leaf buffer */
+ int magic) /* magic number for block */
+{
+ xfs_dabuf_t *bp; /* leaf buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ dp = args->dp;
+ ASSERT(dp != NULL);
+ tp = args->trans;
+ mp = dp->i_mount;
+ ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
+ bno < XFS_DIR2_FREE_FIRSTDB(mp));
+ /*
+ * Get the buffer for the block.
+ */
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
+ XFS_DATA_FORK);
+ if (error) {
+ return error;
+ }
+ ASSERT(bp != NULL);
+ leaf = bp->data;
+ /*
+ * Initialize the header.
+ */
+ leaf->hdr.info.magic = cpu_to_be16(magic);
+ leaf->hdr.info.forw = 0;
+ leaf->hdr.info.back = 0;
+ leaf->hdr.count = 0;
+ leaf->hdr.stale = 0;
+ xfs_dir2_leaf_log_header(tp, bp);
+ /*
+ * If it's a leaf-format directory initialize the tail.
+ * In this case our caller has the real bests table to copy into
+ * the block.
+ */
+ if (magic == XFS_DIR2_LEAF1_MAGIC) {
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp->bestcount = 0;
+ xfs_dir2_leaf_log_tail(tp, bp);
+ }
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+static void
+xfs_dir2_leaf_log_bests(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp, /* leaf buffer */
+ int first, /* first entry to log */
+ int last) /* last entry to log */
+{
+ __be16 *firstb; /* pointer to first entry */
+ __be16 *lastb; /* pointer to last entry */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC);
+ ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
+ firstb = xfs_dir2_leaf_bests_p(ltp) + first;
+ lastb = xfs_dir2_leaf_bests_p(ltp) + last;
+ xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+ (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir2_leaf_log_ents(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp, /* leaf buffer */
+ int first, /* first entry to log */
+ int last) /* last entry to log */
+{
+ xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
+ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC ||
+ be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ firstlep = &leaf->ents[first];
+ lastlep = &leaf->ents[last];
+ xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+ (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir2_leaf_log_header(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp) /* leaf buffer */
+{
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC ||
+ be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
+ (uint)(sizeof(leaf->hdr) - 1));
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+STATIC void
+xfs_dir2_leaf_log_tail(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp) /* leaf buffer */
+{
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+
+ mp = tp->t_mountp;
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC);
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
+ (uint)(mp->m_dirblksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* found entry index */
+ xfs_dabuf_t *lbp; /* leaf buffer */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leaf_lookup(args);
+
+ /*
+ * Look up name in the leaf block, returning both buffers and index.
+ */
+ if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ return error;
+ }
+ tp = args->trans;
+ dp = args->dp;
+ xfs_dir2_leaf_check(dp, lbp);
+ leaf = lbp->data;
+ /*
+ * Get to the leaf entry and contained data entry address.
+ */
+ lep = &leaf->ents[index];
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)dbp->data +
+ xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ /*
+ * Return the found inode number & CI name if appropriate
+ */
+ args->inumber = be64_to_cpu(dep->inumber);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ xfs_da_brelse(tp, dbp);
+ xfs_da_brelse(tp, lbp);
+ return XFS_ERROR(error);
+}
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found dbpp will be NULL, and ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int /* error */
+xfs_dir2_leaf_lookup_int(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t **lbpp, /* out: leaf buffer */
+ int *indexp, /* out: index in leaf block */
+ xfs_dabuf_t **dbpp) /* out: data buffer */
+{
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dabuf_t *dbp = NULL; /* data buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* index in leaf block */
+ xfs_dabuf_t *lbp; /* leaf buffer */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ xfs_dir2_db_t cidb = -1; /* case match data block no. */
+ enum xfs_dacmp cmp; /* name compare result */
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ /*
+ * Read the leaf block into the buffer.
+ */
+ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+ *lbpp = lbp;
+ leaf = lbp->data;
+ xfs_dir2_leaf_check(dp, lbp);
+ /*
+ * Look for the first leaf entry with our hash value.
+ */
+ index = xfs_dir2_leaf_search_hash(args, lbp);
+ /*
+ * Loop over all the entries with the right hash value
+ * looking to match the name.
+ */
+ for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip over stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Get the new data block number.
+ */
+ newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ /*
+ * If it's not the same as the old data block number,
+ * need to pitch the old one and read the new one.
+ */
+ if (newdb != curdb) {
+ if (dbp)
+ xfs_da_brelse(tp, dbp);
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, newdb),
+ -1, &dbp, XFS_DATA_FORK);
+ if (error) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ xfs_dir2_data_check(dp, dbp);
+ curdb = newdb;
+ }
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)dbp->data +
+ xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ /*
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
+ */
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ *indexp = index;
+ /* case exact match: return the current buffer. */
+ if (cmp == XFS_CMP_EXACT) {
+ *dbpp = dbp;
+ return 0;
+ }
+ cidb = curdb;
+ }
+ }
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or remove).
+ * If a case-insensitive match was found earlier, re-read the
+ * appropriate data block if required and return it.
+ */
+ if (args->cmpresult == XFS_CMP_CASE) {
+ ASSERT(cidb != -1);
+ if (cidb != curdb) {
+ xfs_da_brelse(tp, dbp);
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, cidb),
+ -1, &dbp, XFS_DATA_FORK);
+ if (error) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ }
+ *dbpp = dbp;
+ return 0;
+ }
+ /*
+ * No match found, return ENOENT.
+ */
+ ASSERT(cidb == -1);
+ if (dbp)
+ xfs_da_brelse(tp, dbp);
+ xfs_da_brelse(tp, lbp);
+ return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int /* error */
+xfs_dir2_leaf_removename(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ __be16 *bestsp; /* leaf block best freespace */
+ xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_db_t db; /* data block number */
+ xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry structure */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dir2_db_t i; /* temporary data block # */
+ int index; /* index into leaf entries */
+ xfs_dabuf_t *lbp; /* leaf buffer */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data frees */
+ xfs_dir2_data_off_t oldbest; /* old value of best free */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leaf_removename(args);
+
+ /*
+ * Lookup the leaf entry, get the leaf and data blocks read in.
+ */
+ if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ return error;
+ }
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = lbp->data;
+ data = dbp->data;
+ xfs_dir2_data_check(dp, dbp);
+ /*
+ * Point to the leaf entry, use that to point to the data entry.
+ */
+ lep = &leaf->ents[index];
+ db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)data + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ needscan = needlog = 0;
+ oldbest = be16_to_cpu(data->hdr.bestfree[0].length);
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
+ /*
+ * Mark the former data entry unused.
+ */
+ xfs_dir2_data_make_free(tp, dbp,
+ (xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
+ xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+ /*
+ * We just mark the leaf entry stale by putting a null in it.
+ */
+ be16_add_cpu(&leaf->hdr.stale, 1);
+ xfs_dir2_leaf_log_header(tp, lbp);
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+ /*
+ * Scan the freespace in the data block again if necessary,
+ * log the data block header if necessary.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, data, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ /*
+ * If the longest freespace in the data block has changed,
+ * put the new value in the bests table and log that.
+ */
+ if (be16_to_cpu(data->hdr.bestfree[0].length) != oldbest) {
+ bestsp[db] = data->hdr.bestfree[0].length;
+ xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+ }
+ xfs_dir2_data_check(dp, dbp);
+ /*
+ * If the data block is now empty then get rid of the data block.
+ */
+ if (be16_to_cpu(data->hdr.bestfree[0].length) ==
+ mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+ ASSERT(db != mp->m_dirdatablk);
+ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+ /*
+ * Nope, can't get rid of it because it caused
+ * allocation of a bmap btree block to do so.
+ * Just go on, returning success, leaving the
+ * empty block in place.
+ */
+ if (error == ENOSPC && args->total == 0) {
+ xfs_da_buf_done(dbp);
+ error = 0;
+ }
+ xfs_dir2_leaf_check(dp, lbp);
+ xfs_da_buf_done(lbp);
+ return error;
+ }
+ dbp = NULL;
+ /*
+ * If this is the last data block then compact the
+ * bests table by getting rid of entries.
+ */
+ if (db == be32_to_cpu(ltp->bestcount) - 1) {
+ /*
+ * Look for the last active entry (i).
+ */
+ for (i = db - 1; i > 0; i--) {
+ if (be16_to_cpu(bestsp[i]) != NULLDATAOFF)
+ break;
+ }
+ /*
+ * Copy the table down so inactive entries at the
+ * end are removed.
+ */
+ memmove(&bestsp[db - i], bestsp,
+ (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
+ be32_add_cpu(&ltp->bestcount, -(db - i));
+ xfs_dir2_leaf_log_tail(tp, lbp);
+ xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ } else
+ bestsp[db] = cpu_to_be16(NULLDATAOFF);
+ }
+ /*
+ * If the data block was not the first one, drop it.
+ */
+ else if (db != mp->m_dirdatablk && dbp != NULL) {
+ xfs_da_buf_done(dbp);
+ dbp = NULL;
+ }
+ xfs_dir2_leaf_check(dp, lbp);
+ /*
+ * See if we can convert to block form.
+ */
+ return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int /* error */
+xfs_dir2_leaf_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ int index; /* index of leaf entry */
+ xfs_dabuf_t *lbp; /* leaf buffer */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leaf_replace(args);
+
+ /*
+ * Look up the entry.
+ */
+ if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ return error;
+ }
+ dp = args->dp;
+ leaf = lbp->data;
+ /*
+ * Point to the leaf entry, get data address from it.
+ */
+ lep = &leaf->ents[index];
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)dbp->data +
+ xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ ASSERT(args->inumber != be64_to_cpu(dep->inumber));
+ /*
+ * Put the new inode number in, log it.
+ */
+ dep->inumber = cpu_to_be64(args->inumber);
+ tp = args->trans;
+ xfs_dir2_data_log_entry(tp, dbp, dep);
+ xfs_da_buf_done(dbp);
+ xfs_dir2_leaf_check(dp, lbp);
+ xfs_da_brelse(tp, lbp);
+ return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int /* index value */
+xfs_dir2_leaf_search_hash(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *lbp) /* leaf buffer */
+{
+ xfs_dahash_t hash=0; /* hash from this entry */
+ xfs_dahash_t hashwant; /* hash value looking for */
+ int high; /* high leaf index */
+ int low; /* low leaf index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int mid=0; /* current leaf index */
+
+ leaf = lbp->data;
+#ifndef __KERNEL__
+ if (!leaf->hdr.count)
+ return 0;
+#endif
+ /*
+ * Note, the table cannot be empty, so we have to go through the loop.
+ * Binary search the leaf entries looking for our hash value.
+ */
+ for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1,
+ hashwant = args->hashval;
+ low <= high; ) {
+ mid = (low + high) >> 1;
+ if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant)
+ break;
+ if (hash < hashwant)
+ low = mid + 1;
+ else
+ high = mid - 1;
+ }
+ /*
+ * Found one, back up through all the equal hash values.
+ */
+ if (hash == hashwant) {
+ while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) {
+ mid--;
+ }
+ }
+ /*
+ * Need to point to an entry higher than ours.
+ */
+ else if (hash < hashwant)
+ mid++;
+ return mid;
+}
+
+/*
+ * Trim off a trailing data block. We know it's empty since the leaf
+ * freespace table says so.
+ */
+int /* error */
+xfs_dir2_leaf_trim_data(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *lbp, /* leaf buffer */
+ xfs_dir2_db_t db) /* data block number */
+{
+ __be16 *bestsp; /* leaf bests table */
+#ifdef DEBUG
+ xfs_dir2_data_t *data; /* data block structure */
+#endif
+ xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ /*
+ * Read the offending data block. We need its buffer.
+ */
+ if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
+ XFS_DATA_FORK))) {
+ return error;
+ }
+#ifdef DEBUG
+ data = dbp->data;
+ ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC);
+#endif
+ /* this seems to be an error
+ * data is only valid if DEBUG is defined?
+ * RMC 09/08/1999
+ */
+
+ leaf = lbp->data;
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) ==
+ mp->m_dirblksize - (uint)sizeof(data->hdr));
+ ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
+ /*
+ * Get rid of the data block.
+ */
+ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+ ASSERT(error != ENOSPC);
+ xfs_da_brelse(tp, dbp);
+ return error;
+ }
+ /*
+ * Eliminate the last bests entry from the table.
+ */
+ bestsp = xfs_dir2_leaf_bests_p(ltp);
+ be32_add_cpu(&ltp->bestcount, -1);
+ memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
+ xfs_dir2_leaf_log_tail(tp, lbp);
+ xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ return 0;
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int /* error */
+xfs_dir2_node_to_leaf(
+ xfs_da_state_t *state) /* directory operation state */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dabuf_t *fbp; /* buffer for freespace block */
+ xfs_fileoff_t fo; /* freespace file offset */
+ xfs_dir2_free_t *free; /* freespace structure */
+ xfs_dabuf_t *lbp; /* buffer for leaf block */
+ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int rval; /* successful free trim? */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ /*
+ * There's more than a leaf level in the btree, so there must
+ * be multiple leafn blocks. Give up.
+ */
+ if (state->path.active > 1)
+ return 0;
+ args = state->args;
+
+ trace_xfs_dir2_node_to_leaf(args);
+
+ mp = state->mp;
+ dp = args->dp;
+ tp = args->trans;
+ /*
+ * Get the last offset in the file.
+ */
+ if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+ return error;
+ }
+ fo -= mp->m_dirblkfsbs;
+ /*
+ * If there are freespace blocks other than the first one,
+ * take this opportunity to remove trailing empty freespace blocks
+ * that may have been left behind during no-space-reservation
+ * operations.
+ */
+ while (fo > mp->m_dirfreeblk) {
+ if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+ return error;
+ }
+ if (rval)
+ fo -= mp->m_dirblkfsbs;
+ else
+ return 0;
+ }
+ /*
+ * Now find the block just before the freespace block.
+ */
+ if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+ return error;
+ }
+ /*
+ * If it's not the single leaf block, give up.
+ */
+ if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+ return 0;
+ lbp = state->path.blk[0].bp;
+ leaf = lbp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Read the freespace block.
+ */
+ if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
+ XFS_DATA_FORK))) {
+ return error;
+ }
+ free = fbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ ASSERT(!free->hdr.firstdb);
+ /*
+ * Now see if the leafn and free data will fit in a leaf1.
+ * If not, release the buffer and give up.
+ */
+ if ((uint)sizeof(leaf->hdr) +
+ (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)) * (uint)sizeof(leaf->ents[0]) +
+ be32_to_cpu(free->hdr.nvalid) * (uint)sizeof(leaf->bests[0]) +
+ (uint)sizeof(leaf->tail) >
+ mp->m_dirblksize) {
+ xfs_da_brelse(tp, fbp);
+ return 0;
+ }
+ /*
+ * If the leaf has any stale entries in it, compress them out.
+ * The compact routine will log the header.
+ */
+ if (be16_to_cpu(leaf->hdr.stale))
+ xfs_dir2_leaf_compact(args, lbp);
+ else
+ xfs_dir2_leaf_log_header(tp, lbp);
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+ /*
+ * Set up the leaf tail from the freespace block.
+ */
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp->bestcount = free->hdr.nvalid;
+ /*
+ * Set up the leaf bests table.
+ */
+ memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests,
+ be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0]));
+ xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir2_leaf_log_tail(tp, lbp);
+ xfs_dir2_leaf_check(dp, lbp);
+ /*
+ * Get rid of the freespace block.
+ */
+ error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
+ if (error) {
+ /*
+ * This can't fail here because it can only happen when
+ * punching out the middle of an extent, and this is an
+ * isolated block.
+ */
+ ASSERT(error != ENOSPC);
+ return error;
+ }
+ fbp = NULL;
+ /*
+ * Now see if we can convert the single-leaf directory
+ * down to a block form directory.
+ * This routine always kills the dabuf for the leaf, so
+ * eliminate it from the path.
+ */
+ error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+ state->path.blk[0].bp = NULL;
+ return error;
+}
diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h
new file mode 100644
index 00000000..6c9539f0
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DIR2_LEAF_H__
+#define __XFS_DIR2_LEAF_H__
+
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Offset of the leaf/node space. First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE 1
+#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_LEAF_FIRSTDB(mp) \
+ xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET)
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef __uint32_t xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
+#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+ xfs_da_blkinfo_t info; /* header for da routines */
+ __be16 count; /* count of entries */
+ __be16 stale; /* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+ __be32 hashval; /* hash value of name */
+ __be32 address; /* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+ __be32 bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ * bests and tail are at the end of the block for single-leaf only
+ * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC).
+ */
+typedef struct xfs_dir2_leaf {
+ xfs_dir2_leaf_hdr_t hdr; /* leaf header */
+ xfs_dir2_leaf_entry_t ents[1]; /* entries */
+ /* ... */
+ xfs_dir2_data_off_t bests[1]; /* best free counts */
+ xfs_dir2_leaf_tail_t tail; /* leaf tail */
+} xfs_dir2_leaf_t;
+
+/*
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
+{
+ return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) /
+ (uint)sizeof(xfs_dir2_leaf_entry_t));
+}
+
+/*
+ * Get address of the bestcount field in the single-leaf block.
+ */
+static inline xfs_dir2_leaf_tail_t *
+xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp)
+{
+ return (xfs_dir2_leaf_tail_t *)
+ ((char *)(lp) + (mp)->m_dirblksize -
+ (uint)sizeof(xfs_dir2_leaf_tail_t));
+}
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+static inline __be16 *
+xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp)
+{
+ return (__be16 *)ltp - be32_to_cpu(ltp->bestcount);
+}
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)((by) >> \
+ ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog));
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)((by) & \
+ ((1 << ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)(db) << \
+ ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o);
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog);
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog);
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0);
+}
+
+/*
+ * Function declarations.
+ */
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
+ struct xfs_dabuf *dbp);
+extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
+extern void xfs_dir2_leaf_compact(struct xfs_da_args *args,
+ struct xfs_dabuf *bp);
+extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp,
+ int *lowstalep, int *highstalep,
+ int *lowlogp, int *highlogp);
+extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent,
+ size_t bufsize, xfs_off_t *offset,
+ filldir_t filldir);
+extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
+ struct xfs_dabuf **bpp, int magic);
+extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp,
+ int first, int last);
+extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp,
+ struct xfs_dabuf *bp);
+extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_replace(struct xfs_da_args *args);
+extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+ struct xfs_dabuf *lbp);
+extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
+ struct xfs_dabuf *lbp, xfs_dir2_db_t db);
+extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+#endif /* __XFS_DIR2_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
new file mode 100644
index 00000000..a0aab7d3
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.c
@@ -0,0 +1,2076 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_dir2_node.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+/*
+ * Function declarations.
+ */
+static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp);
+static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index);
+#ifdef DEBUG
+static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+#else
+#define xfs_dir2_leafn_check(dp, bp)
+#endif
+static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s,
+ int start_s, xfs_dabuf_t *bp_d, int start_d,
+ int count);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+ xfs_da_state_blk_t *blk1,
+ xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp,
+ int index, xfs_da_state_blk_t *dblk,
+ int *rval);
+static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
+ xfs_da_state_blk_t *fblk);
+
+/*
+ * Log entries from a freespace block.
+ */
+STATIC void
+xfs_dir2_free_log_bests(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp, /* freespace buffer */
+ int first, /* first entry to log */
+ int last) /* last entry to log */
+{
+ xfs_dir2_free_t *free; /* freespace structure */
+
+ free = bp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ xfs_da_log_buf(tp, bp,
+ (uint)((char *)&free->bests[first] - (char *)free),
+ (uint)((char *)&free->bests[last] - (char *)free +
+ sizeof(free->bests[0]) - 1));
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_dabuf_t *bp) /* freespace buffer */
+{
+ xfs_dir2_free_t *free; /* freespace structure */
+
+ free = bp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
+ (uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int /* error */
+xfs_dir2_leaf_to_node(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *lbp) /* leaf buffer */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ xfs_dabuf_t *fbp; /* freespace buffer */
+ xfs_dir2_db_t fdb; /* freespace block number */
+ xfs_dir2_free_t *free; /* freespace structure */
+ __be16 *from; /* pointer to freespace entry */
+ int i; /* leaf freespace index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int n; /* count of live freespc ents */
+ xfs_dir2_data_off_t off; /* freespace entry value */
+ __be16 *to; /* pointer to freespace entry */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leaf_to_node(args);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ /*
+ * Add a freespace block to the directory.
+ */
+ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+ return error;
+ }
+ ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
+ /*
+ * Get the buffer for the new freespace block.
+ */
+ if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
+ XFS_DATA_FORK))) {
+ return error;
+ }
+ ASSERT(fbp != NULL);
+ free = fbp->data;
+ leaf = lbp->data;
+ ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ /*
+ * Initialize the freespace block header.
+ */
+ free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
+ free->hdr.firstdb = 0;
+ ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
+ free->hdr.nvalid = ltp->bestcount;
+ /*
+ * Copy freespace entries from the leaf block to the new block.
+ * Count active entries.
+ */
+ for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests;
+ i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+ if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
+ n++;
+ *to = cpu_to_be16(off);
+ }
+ free->hdr.nused = cpu_to_be32(n);
+ leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Log everything.
+ */
+ xfs_dir2_leaf_log_header(tp, lbp);
+ xfs_dir2_free_log_header(tp, fbp);
+ xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1);
+ xfs_da_buf_done(fbp);
+ xfs_dir2_leafn_check(dp, lbp);
+ return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int /* error */
+xfs_dir2_leafn_add(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int index) /* insertion pt for new entry */
+{
+ int compact; /* compacting stale leaves */
+ xfs_inode_t *dp; /* incore directory inode */
+ int highstale; /* next stale entry */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int lfloghigh; /* high leaf entry logging */
+ int lfloglow; /* low leaf entry logging */
+ int lowstale; /* previous stale entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leafn_add(args, index);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ leaf = bp->data;
+
+ /*
+ * Quick check just to make sure we are not going to index
+ * into other peoples memory
+ */
+ if (index < 0)
+ return XFS_ERROR(EFSCORRUPTED);
+
+ /*
+ * If there are already the maximum number of leaf entries in
+ * the block, if there are no stale entries it won't fit.
+ * Caller will do a split. If there are stale entries we'll do
+ * a compact.
+ */
+
+ if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) {
+ if (!leaf->hdr.stale)
+ return XFS_ERROR(ENOSPC);
+ compact = be16_to_cpu(leaf->hdr.stale) > 1;
+ } else
+ compact = 0;
+ ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval);
+ ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
+ be32_to_cpu(leaf->ents[index].hashval) >= args->hashval);
+
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ return 0;
+
+ /*
+ * Compact out all but one stale leaf entry. Leaves behind
+ * the entry closest to index.
+ */
+ if (compact) {
+ xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
+ &lfloglow, &lfloghigh);
+ }
+ /*
+ * Set impossible logging indices for this case.
+ */
+ else if (leaf->hdr.stale) {
+ lfloglow = be16_to_cpu(leaf->hdr.count);
+ lfloghigh = -1;
+ }
+ /*
+ * No stale entries, just insert a space for the new entry.
+ */
+ if (!leaf->hdr.stale) {
+ lep = &leaf->ents[index];
+ if (index < be16_to_cpu(leaf->hdr.count))
+ memmove(lep + 1, lep,
+ (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep));
+ lfloglow = index;
+ lfloghigh = be16_to_cpu(leaf->hdr.count);
+ be16_add_cpu(&leaf->hdr.count, 1);
+ }
+ /*
+ * There are stale entries. We'll use one for the new entry.
+ */
+ else {
+ /*
+ * If we didn't do a compact then we need to figure out
+ * which stale entry will be used.
+ */
+ if (compact == 0) {
+ /*
+ * Find first stale entry before our insertion point.
+ */
+ for (lowstale = index - 1;
+ lowstale >= 0 &&
+ be32_to_cpu(leaf->ents[lowstale].address) !=
+ XFS_DIR2_NULL_DATAPTR;
+ lowstale--)
+ continue;
+ /*
+ * Find next stale entry after insertion point.
+ * Stop looking if the answer would be worse than
+ * lowstale already found.
+ */
+ for (highstale = index;
+ highstale < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(leaf->ents[highstale].address) !=
+ XFS_DIR2_NULL_DATAPTR &&
+ (lowstale < 0 ||
+ index - lowstale - 1 >= highstale - index);
+ highstale++)
+ continue;
+ }
+ /*
+ * Using the low stale entry.
+ * Shift entries up toward the stale slot.
+ */
+ if (lowstale >= 0 &&
+ (highstale == be16_to_cpu(leaf->hdr.count) ||
+ index - lowstale - 1 < highstale - index)) {
+ ASSERT(be32_to_cpu(leaf->ents[lowstale].address) ==
+ XFS_DIR2_NULL_DATAPTR);
+ ASSERT(index - lowstale - 1 >= 0);
+ if (index - lowstale - 1 > 0)
+ memmove(&leaf->ents[lowstale],
+ &leaf->ents[lowstale + 1],
+ (index - lowstale - 1) * sizeof(*lep));
+ lep = &leaf->ents[index - 1];
+ lfloglow = MIN(lowstale, lfloglow);
+ lfloghigh = MAX(index - 1, lfloghigh);
+ }
+ /*
+ * Using the high stale entry.
+ * Shift entries down toward the stale slot.
+ */
+ else {
+ ASSERT(be32_to_cpu(leaf->ents[highstale].address) ==
+ XFS_DIR2_NULL_DATAPTR);
+ ASSERT(highstale - index >= 0);
+ if (highstale - index > 0)
+ memmove(&leaf->ents[index + 1],
+ &leaf->ents[index],
+ (highstale - index) * sizeof(*lep));
+ lep = &leaf->ents[index];
+ lfloglow = MIN(index, lfloglow);
+ lfloghigh = MAX(highstale, lfloghigh);
+ }
+ be16_add_cpu(&leaf->hdr.stale, -1);
+ }
+ /*
+ * Insert the new entry, log everything.
+ */
+ lep->hashval = cpu_to_be32(args->hashval);
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
+ args->blkno, args->index));
+ xfs_dir2_leaf_log_header(tp, bp);
+ xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
+ xfs_dir2_leafn_check(dp, bp);
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check internal consistency of a leafn block.
+ */
+void
+xfs_dir2_leafn_check(
+ xfs_inode_t *dp, /* incore directory inode */
+ xfs_dabuf_t *bp) /* leaf buffer */
+{
+ int i; /* leaf index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int stale; /* count of stale leaves */
+
+ leaf = bp->data;
+ mp = dp->i_mount;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
+ for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
+ if (i + 1 < be16_to_cpu(leaf->hdr.count)) {
+ ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
+ be32_to_cpu(leaf->ents[i + 1].hashval));
+ }
+ if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR)
+ stale++;
+ }
+ ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
+}
+#endif /* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t /* hash value */
+xfs_dir2_leafn_lasthash(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ int *count) /* count of entries in leaf */
+{
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ if (count)
+ *count = be16_to_cpu(leaf->hdr.count);
+ if (!leaf->hdr.count)
+ return 0;
+ return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval);
+}
+
+/*
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ xfs_dabuf_t *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_db_t curfdb = -1; /* current free block number */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int fi; /* free entry index */
+ xfs_dir2_free_t *free = NULL; /* free block structure */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int length; /* length of new data entry */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_dir2_db_t newfdb; /* new free block number */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+#ifdef __KERNEL__
+ ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
+#endif
+ xfs_dir2_leafn_check(dp, bp);
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ /* If so, it's a free block buffer, get the block number. */
+ curbp = state->extrablk.bp;
+ curfdb = state->extrablk.blkno;
+ free = curbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ }
+ length = xfs_dir2_data_entsize(args->namelen);
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ /*
+ * For addname, we're looking for a place to put the new entry.
+ * We want to use a data block with an entry of equal
+ * hash value to ours if there is one with room.
+ *
+ * If this block isn't the data block we already have
+ * in hand, take a look at it.
+ */
+ if (newdb != curdb) {
+ curdb = newdb;
+ /*
+ * Convert the data block to the free block
+ * holding its freespace information.
+ */
+ newfdb = xfs_dir2_db_to_fdb(mp, newdb);
+ /*
+ * If it's not the one we have in hand, read it in.
+ */
+ if (newfdb != curfdb) {
+ /*
+ * If we had one before, drop it.
+ */
+ if (curbp)
+ xfs_da_brelse(tp, curbp);
+ /*
+ * Read the free block.
+ */
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, newfdb),
+ -1, &curbp, XFS_DATA_FORK);
+ if (error)
+ return error;
+ free = curbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) ==
+ XFS_DIR2_FREE_MAGIC);
+ ASSERT((be32_to_cpu(free->hdr.firstdb) %
+ XFS_DIR2_MAX_FREE_BESTS(mp)) == 0);
+ ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
+ ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) +
+ be32_to_cpu(free->hdr.nvalid));
+ }
+ /*
+ * Get the index for our entry.
+ */
+ fi = xfs_dir2_db_to_fdindex(mp, curdb);
+ /*
+ * If it has room, return it.
+ */
+ if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
+ XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+ XFS_ERRLEVEL_LOW, mp);
+ if (curfdb != newfdb)
+ xfs_da_brelse(tp, curbp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ curfdb = newfdb;
+ if (be16_to_cpu(free->bests[fi]) >= length)
+ goto out;
+ }
+ }
+ /* Didn't find any space */
+ fi = -1;
+out:
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ if (curbp) {
+ /* Giving back a free block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = fi;
+ state->extrablk.blkno = curfdb;
+ state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+ } else {
+ state->extravalid = 0;
+ }
+ /*
+ * Return the index, that will be the insertion point.
+ */
+ *indexp = index;
+ return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ xfs_dabuf_t *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+#ifdef __KERNEL__
+ ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
+#endif
+ xfs_dir2_leafn_check(dp, bp);
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ curbp = state->extrablk.bp;
+ curdb = state->extrablk.blkno;
+ }
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
+ /*
+ * Skip stale leaf entries.
+ */
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ /*
+ * Not adding a new entry, so we really want to find
+ * the name given to us.
+ *
+ * If it's a different data block, go get it.
+ */
+ if (newdb != curdb) {
+ /*
+ * If we had a block before that we aren't saving
+ * for a CI name, drop it
+ */
+ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+ curdb != state->extrablk.blkno))
+ xfs_da_brelse(tp, curbp);
+ /*
+ * If needing the block that is saved with a CI match,
+ * use it otherwise read in the new data block.
+ */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ newdb == state->extrablk.blkno) {
+ ASSERT(state->extravalid);
+ curbp = state->extrablk.bp;
+ } else {
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, newdb),
+ -1, &curbp, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+ xfs_dir2_data_check(dp, curbp);
+ curdb = newdb;
+ }
+ /*
+ * Point to the data entry.
+ */
+ dep = (xfs_dir2_data_entry_t *)((char *)curbp->data +
+ xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ /*
+ * Compare the entry and if it's an exact match, return
+ * EEXIST immediately. If it's the first case-insensitive
+ * match, store the block & inode number and continue looking.
+ */
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ /* If there is a CI match block, drop it */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ curdb != state->extrablk.blkno)
+ xfs_da_brelse(tp, state->extrablk.bp);
+ args->cmpresult = cmp;
+ args->inumber = be64_to_cpu(dep->inumber);
+ *indexp = index;
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.blkno = curdb;
+ state->extrablk.index = (int)((char *)dep -
+ (char *)curbp->data);
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ if (cmp == XFS_CMP_EXACT)
+ return XFS_ERROR(EEXIST);
+ }
+ }
+ ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
+ (args->op_flags & XFS_DA_OP_OKNOENT));
+ if (curbp) {
+ if (args->cmpresult == XFS_CMP_DIFFERENT) {
+ /* Giving back last used data block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = -1;
+ state->extrablk.blkno = curdb;
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ } else {
+ /* If the curbp is not the CI match block, drop it */
+ if (state->extrablk.bp != curbp)
+ xfs_da_brelse(tp, curbp);
+ }
+ } else {
+ state->extravalid = 0;
+ }
+ *indexp = index;
+ return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ if (args->op_flags & XFS_DA_OP_ADDNAME)
+ return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+ state);
+ return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers. Stale entries are preserved.
+ */
+static void
+xfs_dir2_leafn_moveents(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *bp_s, /* source leaf buffer */
+ int start_s, /* source leaf index */
+ xfs_dabuf_t *bp_d, /* destination leaf buffer */
+ int start_d, /* destination leaf index */
+ int count) /* count of leaves to copy */
+{
+ xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */
+ xfs_dir2_leaf_t *leaf_s; /* source leaf structure */
+ int stale; /* count stale leaves copied */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
+
+ /*
+ * Silently return if nothing to do.
+ */
+ if (count == 0) {
+ return;
+ }
+ tp = args->trans;
+ leaf_s = bp_s->data;
+ leaf_d = bp_d->data;
+ /*
+ * If the destination index is not the end of the current
+ * destination leaf entries, open up a hole in the destination
+ * to hold the new entries.
+ */
+ if (start_d < be16_to_cpu(leaf_d->hdr.count)) {
+ memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d],
+ (be16_to_cpu(leaf_d->hdr.count) - start_d) *
+ sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
+ count + be16_to_cpu(leaf_d->hdr.count) - 1);
+ }
+ /*
+ * If the source has stale leaves, count the ones in the copy range
+ * so we can update the header correctly.
+ */
+ if (leaf_s->hdr.stale) {
+ int i; /* temp leaf index */
+
+ for (i = start_s, stale = 0; i < start_s + count; i++) {
+ if (be32_to_cpu(leaf_s->ents[i].address) == XFS_DIR2_NULL_DATAPTR)
+ stale++;
+ }
+ } else
+ stale = 0;
+ /*
+ * Copy the leaf entries from source to destination.
+ */
+ memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s],
+ count * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+ /*
+ * If there are source entries after the ones we copied,
+ * delete the ones we copied by sliding the next ones down.
+ */
+ if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) {
+ memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count],
+ count * sizeof(xfs_dir2_leaf_entry_t));
+ xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+ }
+ /*
+ * Update the headers and log them.
+ */
+ be16_add_cpu(&leaf_s->hdr.count, -(count));
+ be16_add_cpu(&leaf_s->hdr.stale, -(stale));
+ be16_add_cpu(&leaf_d->hdr.count, count);
+ be16_add_cpu(&leaf_d->hdr.stale, stale);
+ xfs_dir2_leaf_log_header(tp, bp_s);
+ xfs_dir2_leaf_log_header(tp, bp_d);
+ xfs_dir2_leafn_check(args->dp, bp_s);
+ xfs_dir2_leafn_check(args->dp, bp_d);
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
+ */
+int /* sort order */
+xfs_dir2_leafn_order(
+ xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */
+ xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */
+{
+ xfs_dir2_leaf_t *leaf1; /* leaf1 structure */
+ xfs_dir2_leaf_t *leaf2; /* leaf2 structure */
+
+ leaf1 = leaf1_bp->data;
+ leaf2 = leaf2_bp->data;
+ ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ if (be16_to_cpu(leaf1->hdr.count) > 0 &&
+ be16_to_cpu(leaf2->hdr.count) > 0 &&
+ (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) ||
+ be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) <
+ be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval)))
+ return 1;
+ return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+ xfs_da_state_t *state, /* btree cursor */
+ xfs_da_state_blk_t *blk1, /* first btree block */
+ xfs_da_state_blk_t *blk2) /* second btree block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ int count; /* count (& direction) leaves */
+ int isleft; /* new goes in left leaf */
+ xfs_dir2_leaf_t *leaf1; /* first leaf structure */
+ xfs_dir2_leaf_t *leaf2; /* second leaf structure */
+ int mid; /* midpoint leaf index */
+#ifdef DEBUG
+ int oldstale; /* old count of stale leaves */
+#endif
+ int oldsum; /* old total leaf count */
+ int swap; /* swapped leaf blocks */
+
+ args = state->args;
+ /*
+ * If the block order is wrong, swap the arguments.
+ */
+ if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
+ xfs_da_state_blk_t *tmp; /* temp for block swap */
+
+ tmp = blk1;
+ blk1 = blk2;
+ blk2 = tmp;
+ }
+ leaf1 = blk1->bp->data;
+ leaf2 = blk2->bp->data;
+ oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count);
+#ifdef DEBUG
+ oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale);
+#endif
+ mid = oldsum >> 1;
+ /*
+ * If the old leaf count was odd then the new one will be even,
+ * so we need to divide the new count evenly.
+ */
+ if (oldsum & 1) {
+ xfs_dahash_t midhash; /* middle entry hash value */
+
+ if (mid >= be16_to_cpu(leaf1->hdr.count))
+ midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval);
+ else
+ midhash = be32_to_cpu(leaf1->ents[mid].hashval);
+ isleft = args->hashval <= midhash;
+ }
+ /*
+ * If the old count is even then the new count is odd, so there's
+ * no preferred side for the new entry.
+ * Pick the left one.
+ */
+ else
+ isleft = 1;
+ /*
+ * Calculate moved entry count. Positive means left-to-right,
+ * negative means right-to-left. Then move the entries.
+ */
+ count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0);
+ if (count > 0)
+ xfs_dir2_leafn_moveents(args, blk1->bp,
+ be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count);
+ else if (count < 0)
+ xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
+ be16_to_cpu(leaf1->hdr.count), count);
+ ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum);
+ ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale);
+ /*
+ * Mark whether we're inserting into the old or new leaf.
+ */
+ if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count))
+ state->inleaf = swap;
+ else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count))
+ state->inleaf = !swap;
+ else
+ state->inleaf =
+ swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count));
+ /*
+ * Adjust the expected index for insertion.
+ */
+ if (!state->inleaf)
+ blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+
+ /*
+ * Finally sanity check just to make sure we are not returning a
+ * negative index
+ */
+ if(blk2->index < 0) {
+ state->inleaf = 1;
+ blk2->index = 0;
+ xfs_alert(args->dp->i_mount,
+ "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
+ __func__, blk1->index);
+ }
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int /* error */
+xfs_dir2_leafn_remove(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *bp, /* leaf buffer */
+ int index, /* leaf entry index */
+ xfs_da_state_blk_t *dblk, /* data block */
+ int *rval) /* resulting block needs join */
+{
+ xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_db_t db; /* data block number */
+ xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int longest; /* longest data free entry */
+ int off; /* data block entry offset */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data frees */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ trace_xfs_dir2_leafn_remove(args, index);
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Point to the entry we're removing.
+ */
+ lep = &leaf->ents[index];
+ /*
+ * Extract the data block and offset from the entry.
+ */
+ db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ ASSERT(dblk->blkno == db);
+ off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address));
+ ASSERT(dblk->index == off);
+ /*
+ * Kill the leaf entry by marking it stale.
+ * Log the leaf block changes.
+ */
+ be16_add_cpu(&leaf->hdr.stale, 1);
+ xfs_dir2_leaf_log_header(tp, bp);
+ lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+ xfs_dir2_leaf_log_ents(tp, bp, index, index);
+ /*
+ * Make the data entry free. Keep track of the longest freespace
+ * in the data block in case it changes.
+ */
+ dbp = dblk->bp;
+ data = dbp->data;
+ dep = (xfs_dir2_data_entry_t *)((char *)data + off);
+ longest = be16_to_cpu(data->hdr.bestfree[0].length);
+ needlog = needscan = 0;
+ xfs_dir2_data_make_free(tp, dbp, off,
+ xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+ /*
+ * Rescan the data block freespaces for bestfree.
+ * Log the data block header if needed.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, data, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_check(dp, dbp);
+ /*
+ * If the longest data block freespace changes, need to update
+ * the corresponding freeblock entry.
+ */
+ if (longest < be16_to_cpu(data->hdr.bestfree[0].length)) {
+ int error; /* error return value */
+ xfs_dabuf_t *fbp; /* freeblock buffer */
+ xfs_dir2_db_t fdb; /* freeblock block number */
+ int findex; /* index in freeblock entries */
+ xfs_dir2_free_t *free; /* freeblock structure */
+ int logfree; /* need to log free entry */
+
+ /*
+ * Convert the data block number to a free block,
+ * read in the free block.
+ */
+ fdb = xfs_dir2_db_to_fdb(mp, db);
+ if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+ -1, &fbp, XFS_DATA_FORK))) {
+ return error;
+ }
+ free = fbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ ASSERT(be32_to_cpu(free->hdr.firstdb) ==
+ XFS_DIR2_MAX_FREE_BESTS(mp) *
+ (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+ /*
+ * Calculate which entry we need to fix.
+ */
+ findex = xfs_dir2_db_to_fdindex(mp, db);
+ longest = be16_to_cpu(data->hdr.bestfree[0].length);
+ /*
+ * If the data block is now empty we can get rid of it
+ * (usually).
+ */
+ if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+ /*
+ * Try to punch out the data block.
+ */
+ error = xfs_dir2_shrink_inode(args, db, dbp);
+ if (error == 0) {
+ dblk->bp = NULL;
+ data = NULL;
+ }
+ /*
+ * We can get ENOSPC if there's no space reservation.
+ * In this case just drop the buffer and some one else
+ * will eventually get rid of the empty block.
+ */
+ else if (error == ENOSPC && args->total == 0)
+ xfs_da_buf_done(dbp);
+ else
+ return error;
+ }
+ /*
+ * If we got rid of the data block, we can eliminate that entry
+ * in the free block.
+ */
+ if (data == NULL) {
+ /*
+ * One less used entry in the free table.
+ */
+ be32_add_cpu(&free->hdr.nused, -1);
+ xfs_dir2_free_log_header(tp, fbp);
+ /*
+ * If this was the last entry in the table, we can
+ * trim the table size back. There might be other
+ * entries at the end referring to non-existent
+ * data blocks, get those too.
+ */
+ if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
+ int i; /* free entry index */
+
+ for (i = findex - 1;
+ i >= 0 && be16_to_cpu(free->bests[i]) == NULLDATAOFF;
+ i--)
+ continue;
+ free->hdr.nvalid = cpu_to_be32(i + 1);
+ logfree = 0;
+ }
+ /*
+ * Not the last entry, just punch it out.
+ */
+ else {
+ free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+ logfree = 1;
+ }
+ /*
+ * If there are no useful entries left in the block,
+ * get rid of the block if we can.
+ */
+ if (!free->hdr.nused) {
+ error = xfs_dir2_shrink_inode(args, fdb, fbp);
+ if (error == 0) {
+ fbp = NULL;
+ logfree = 0;
+ } else if (error != ENOSPC || args->total != 0)
+ return error;
+ /*
+ * It's possible to get ENOSPC if there is no
+ * space reservation. In this case some one
+ * else will eventually get rid of this block.
+ */
+ }
+ }
+ /*
+ * Data block is not empty, just set the free entry to
+ * the new value.
+ */
+ else {
+ free->bests[findex] = cpu_to_be16(longest);
+ logfree = 1;
+ }
+ /*
+ * Log the free entry that changed, unless we got rid of it.
+ */
+ if (logfree)
+ xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ /*
+ * Drop the buffer if we still have it.
+ */
+ if (fbp)
+ xfs_da_buf_done(fbp);
+ }
+ xfs_dir2_leafn_check(dp, bp);
+ /*
+ * Return indication of whether this leaf block is empty enough
+ * to justify trying to join it with a neighbor.
+ */
+ *rval =
+ ((uint)sizeof(leaf->hdr) +
+ (uint)sizeof(leaf->ents[0]) *
+ (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) <
+ mp->m_dir_magicpct;
+ return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int /* error */
+xfs_dir2_leafn_split(
+ xfs_da_state_t *state, /* btree cursor */
+ xfs_da_state_blk_t *oldblk, /* original block */
+ xfs_da_state_blk_t *newblk) /* newly created block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_dablk_t blkno; /* new leaf block number */
+ int error; /* error return value */
+ xfs_mount_t *mp; /* filesystem mount point */
+
+ /*
+ * Allocate space for a new leaf node.
+ */
+ args = state->args;
+ mp = args->dp->i_mount;
+ ASSERT(args != NULL);
+ ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+ error = xfs_da_grow_inode(args, &blkno);
+ if (error) {
+ return error;
+ }
+ /*
+ * Initialize the new leaf block.
+ */
+ error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno),
+ &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+ if (error) {
+ return error;
+ }
+ newblk->blkno = blkno;
+ newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+ /*
+ * Rebalance the entries across the two leaves, link the new
+ * block into the leaves.
+ */
+ xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+ error = xfs_da_blk_link(state, oldblk, newblk);
+ if (error) {
+ return error;
+ }
+ /*
+ * Insert the new entry in the correct block.
+ */
+ if (state->inleaf)
+ error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+ else
+ error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+ /*
+ * Update last hashval in each block since we added the name.
+ */
+ oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
+ newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
+ xfs_dir2_leafn_check(args->dp, oldblk->bp);
+ xfs_dir2_leafn_check(args->dp, newblk->bp);
+ return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor. Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int /* error */
+xfs_dir2_leafn_toosmall(
+ xfs_da_state_t *state, /* btree cursor */
+ int *action) /* resulting action to take */
+{
+ xfs_da_state_blk_t *blk; /* leaf block */
+ xfs_dablk_t blkno; /* leaf block number */
+ xfs_dabuf_t *bp; /* leaf buffer */
+ int bytes; /* bytes in use */
+ int count; /* leaf live entry count */
+ int error; /* error return value */
+ int forward; /* sibling block direction */
+ int i; /* sibling counter */
+ xfs_da_blkinfo_t *info; /* leaf block header */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ int rval; /* result from path_shift */
+
+ /*
+ * Check for the degenerate case of the block being over 50% full.
+ * If so, it's not worth even looking to see if we might be able
+ * to coalesce with a sibling.
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ info = blk->bp->data;
+ ASSERT(be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC);
+ leaf = (xfs_dir2_leaf_t *)info;
+ count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
+ bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
+ if (bytes > (state->blocksize >> 1)) {
+ /*
+ * Blk over 50%, don't try to join.
+ */
+ *action = 0;
+ return 0;
+ }
+ /*
+ * Check for the degenerate case of the block being empty.
+ * If the block is empty, we'll simply delete it, no need to
+ * coalesce it with a sibling block. We choose (arbitrarily)
+ * to merge with the forward block unless it is NULL.
+ */
+ if (count == 0) {
+ /*
+ * Make altpath point to the block we want to keep and
+ * path point to the block we want to drop (this one).
+ */
+ forward = (info->forw != 0);
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ &rval);
+ if (error)
+ return error;
+ *action = rval ? 2 : 0;
+ return 0;
+ }
+ /*
+ * Examine each sibling block to see if we can coalesce with
+ * at least 25% free space to spare. We need to figure out
+ * whether to merge with the forward or the backward block.
+ * We prefer coalescing with the lower numbered sibling so as
+ * to shrink a directory over time.
+ */
+ forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back);
+ for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+ blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back);
+ if (blkno == 0)
+ continue;
+ /*
+ * Read the sibling leaf block.
+ */
+ if ((error =
+ xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
+ -1, &bp, XFS_DATA_FORK))) {
+ return error;
+ }
+ ASSERT(bp != NULL);
+ /*
+ * Count bytes in the two blocks combined.
+ */
+ leaf = (xfs_dir2_leaf_t *)info;
+ count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
+ bytes = state->blocksize - (state->blocksize >> 2);
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
+ bytes -= count * (uint)sizeof(leaf->ents[0]);
+ /*
+ * Fits with at least 25% to spare.
+ */
+ if (bytes >= 0)
+ break;
+ xfs_da_brelse(state->args->trans, bp);
+ }
+ /*
+ * Didn't like either block, give up.
+ */
+ if (i >= 2) {
+ *action = 0;
+ return 0;
+ }
+ /*
+ * Done with the sibling leaf block here, drop the dabuf
+ * so path_shift can get it.
+ */
+ xfs_da_buf_done(bp);
+ /*
+ * Make altpath point to the block we want to keep (the lower
+ * numbered block) and path point to the block we want to drop.
+ */
+ memcpy(&state->altpath, &state->path, sizeof(state->path));
+ if (blkno < blk->blkno)
+ error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+ &rval);
+ else
+ error = xfs_da_path_shift(state, &state->path, forward, 0,
+ &rval);
+ if (error) {
+ return error;
+ }
+ *action = rval ? 0 : 1;
+ return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+ xfs_da_state_t *state, /* cursor */
+ xfs_da_state_blk_t *drop_blk, /* dead block */
+ xfs_da_state_blk_t *save_blk) /* surviving block */
+{
+ xfs_da_args_t *args; /* operation arguments */
+ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
+ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
+
+ args = state->args;
+ ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ drop_leaf = drop_blk->bp->data;
+ save_leaf = save_blk->bp->data;
+ ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * If there are any stale leaf entries, take this opportunity
+ * to purge them.
+ */
+ if (drop_leaf->hdr.stale)
+ xfs_dir2_leaf_compact(args, drop_blk->bp);
+ if (save_leaf->hdr.stale)
+ xfs_dir2_leaf_compact(args, save_blk->bp);
+ /*
+ * Move the entries from drop to the appropriate end of save.
+ */
+ drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval);
+ if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
+ xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
+ be16_to_cpu(drop_leaf->hdr.count));
+ else
+ xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
+ be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count));
+ save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval);
+ xfs_dir2_leafn_check(args->dp, save_blk->bp);
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int /* error */
+xfs_dir2_node_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block for insert */
+ int error; /* error return value */
+ int rval; /* sub-return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_addname(args);
+
+ /*
+ * Allocate and initialize the state (btree cursor).
+ */
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ state->blocksize = state->mp->m_dirblksize;
+ state->node_ents = state->mp->m_dir_node_ents;
+ /*
+ * Look up the name. We're not supposed to find it, but
+ * this gives us the insertion point.
+ */
+ error = xfs_da_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ if (rval != ENOENT) {
+ goto done;
+ }
+ /*
+ * Add the data entry to a data block.
+ * Extravalid is set to a freeblock found by lookup.
+ */
+ rval = xfs_dir2_node_addname_int(args,
+ state->extravalid ? &state->extrablk : NULL);
+ if (rval) {
+ goto done;
+ }
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ /*
+ * Add the new leaf entry.
+ */
+ rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+ if (rval == 0) {
+ /*
+ * It worked, fix the hash values up the btree.
+ */
+ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
+ xfs_da_fixhashpath(state, &state->path);
+ } else {
+ /*
+ * It didn't work, we need to split the leaf block.
+ */
+ if (args->total == 0) {
+ ASSERT(rval == ENOSPC);
+ goto done;
+ }
+ /*
+ * Split the leaf block and insert the new entry.
+ */
+ rval = xfs_da_split(state);
+ }
+done:
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int /* error */
+xfs_dir2_node_addname_int(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_da_state_blk_t *fblk) /* optional freespace block */
+{
+ xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_db_t dbno; /* data block number */
+ xfs_dabuf_t *dbp; /* data block buffer */
+ xfs_dir2_data_entry_t *dep; /* data entry pointer */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
+ int error; /* error return value */
+ xfs_dir2_db_t fbno; /* freespace block number */
+ xfs_dabuf_t *fbp; /* freespace buffer */
+ int findex; /* freespace entry index */
+ xfs_dir2_free_t *free=NULL; /* freespace block structure */
+ xfs_dir2_db_t ifbno; /* initial freespace block no */
+ xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
+ int length; /* length of the new entry */
+ int logfree; /* need to log free entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ int needlog; /* need to log data header */
+ int needscan; /* need to rescan data frees */
+ __be16 *tagp; /* data entry tag pointer */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ length = xfs_dir2_data_entsize(args->namelen);
+ /*
+ * If we came in with a freespace block that means that lookup
+ * found an entry with our hash value. This is the freespace
+ * block for that data entry.
+ */
+ if (fblk) {
+ fbp = fblk->bp;
+ /*
+ * Remember initial freespace block number.
+ */
+ ifbno = fblk->blkno;
+ free = fbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ findex = fblk->index;
+ /*
+ * This means the free entry showed that the data block had
+ * space for our entry, so we remembered it.
+ * Use that data block.
+ */
+ if (findex >= 0) {
+ ASSERT(findex < be32_to_cpu(free->hdr.nvalid));
+ ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(free->bests[findex]) >= length);
+ dbno = be32_to_cpu(free->hdr.firstdb) + findex;
+ }
+ /*
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
+ */
+ else {
+ dbno = -1;
+ findex = 0;
+ }
+ }
+ /*
+ * Didn't come in with a freespace block, so don't have a data block.
+ */
+ else {
+ ifbno = dbno = -1;
+ fbp = NULL;
+ findex = 0;
+ }
+ /*
+ * If we don't have a data block yet, we're going to scan the
+ * freespace blocks looking for one. Figure out what the
+ * highest freespace block number is.
+ */
+ if (dbno == -1) {
+ xfs_fileoff_t fo; /* freespace block number */
+
+ if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
+ return error;
+ lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo);
+ fbno = ifbno;
+ }
+ /*
+ * While we haven't identified a data block, search the freeblock
+ * data for a good data block. If we find a null freeblock entry,
+ * indicating a hole in the data blocks, remember that.
+ */
+ while (dbno == -1) {
+ /*
+ * If we don't have a freeblock in hand, get the next one.
+ */
+ if (fbp == NULL) {
+ /*
+ * Happens the first time through unless lookup gave
+ * us a freespace block to start with.
+ */
+ if (++fbno == 0)
+ fbno = XFS_DIR2_FREE_FIRSTDB(mp);
+ /*
+ * If it's ifbno we already looked at it.
+ */
+ if (fbno == ifbno)
+ fbno++;
+ /*
+ * If it's off the end we're done.
+ */
+ if (fbno >= lastfbno)
+ break;
+ /*
+ * Read the block. There can be holes in the
+ * freespace blocks, so this might not succeed.
+ * This should be really rare, so there's no reason
+ * to avoid it.
+ */
+ if ((error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
+ XFS_DATA_FORK))) {
+ return error;
+ }
+ if (unlikely(fbp == NULL)) {
+ continue;
+ }
+ free = fbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ findex = 0;
+ }
+ /*
+ * Look at the current free entry. Is it good enough?
+ */
+ if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(free->bests[findex]) >= length)
+ dbno = be32_to_cpu(free->hdr.firstdb) + findex;
+ else {
+ /*
+ * Are we done with the freeblock?
+ */
+ if (++findex == be32_to_cpu(free->hdr.nvalid)) {
+ /*
+ * Drop the block.
+ */
+ xfs_da_brelse(tp, fbp);
+ fbp = NULL;
+ if (fblk && fblk->bp)
+ fblk->bp = NULL;
+ }
+ }
+ }
+ /*
+ * If we don't have a data block, we need to allocate one and make
+ * the freespace entries refer to it.
+ */
+ if (unlikely(dbno == -1)) {
+ /*
+ * Not allowed to allocate, return failure.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+ args->total == 0) {
+ /*
+ * Drop the freespace buffer unless it came from our
+ * caller.
+ */
+ if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
+ xfs_da_buf_done(fbp);
+ return XFS_ERROR(ENOSPC);
+ }
+ /*
+ * Allocate and initialize the new data block.
+ */
+ if (unlikely((error = xfs_dir2_grow_inode(args,
+ XFS_DIR2_DATA_SPACE,
+ &dbno)) ||
+ (error = xfs_dir2_data_init(args, dbno, &dbp)))) {
+ /*
+ * Drop the freespace buffer unless it came from our
+ * caller.
+ */
+ if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
+ xfs_da_buf_done(fbp);
+ return error;
+ }
+ /*
+ * If (somehow) we have a freespace block, get rid of it.
+ */
+ if (fbp)
+ xfs_da_brelse(tp, fbp);
+ if (fblk && fblk->bp)
+ fblk->bp = NULL;
+
+ /*
+ * Get the freespace block corresponding to the data block
+ * that was just allocated.
+ */
+ fbno = xfs_dir2_db_to_fdb(mp, dbno);
+ if (unlikely(error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
+ XFS_DATA_FORK))) {
+ xfs_da_buf_done(dbp);
+ return error;
+ }
+ /*
+ * If there wasn't a freespace block, the read will
+ * return a NULL fbp. Allocate and initialize a new one.
+ */
+ if( fbp == NULL ) {
+ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+ &fbno))) {
+ return error;
+ }
+
+ if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
+ xfs_alert(mp,
+ "%s: dir ino " "%llu needed freesp block %lld for\n"
+ " data block %lld, got %lld ifbno %llu lastfbno %d",
+ __func__, (unsigned long long)dp->i_ino,
+ (long long)xfs_dir2_db_to_fdb(mp, dbno),
+ (long long)dbno, (long long)fbno,
+ (unsigned long long)ifbno, lastfbno);
+ if (fblk) {
+ xfs_alert(mp,
+ " fblk 0x%p blkno %llu index %d magic 0x%x",
+ fblk,
+ (unsigned long long)fblk->blkno,
+ fblk->index,
+ fblk->magic);
+ } else {
+ xfs_alert(mp, " ... fblk is NULL");
+ }
+ XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ /*
+ * Get a buffer for the new block.
+ */
+ if ((error = xfs_da_get_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno),
+ -1, &fbp, XFS_DATA_FORK))) {
+ return error;
+ }
+ ASSERT(fbp != NULL);
+
+ /*
+ * Initialize the new block to be empty, and remember
+ * its first slot as our empty slot.
+ */
+ free = fbp->data;
+ free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
+ free->hdr.firstdb = cpu_to_be32(
+ (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
+ XFS_DIR2_MAX_FREE_BESTS(mp));
+ free->hdr.nvalid = 0;
+ free->hdr.nused = 0;
+ } else {
+ free = fbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ }
+
+ /*
+ * Set the freespace block index from the data block number.
+ */
+ findex = xfs_dir2_db_to_fdindex(mp, dbno);
+ /*
+ * If it's after the end of the current entries in the
+ * freespace block, extend that table.
+ */
+ if (findex >= be32_to_cpu(free->hdr.nvalid)) {
+ ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp));
+ free->hdr.nvalid = cpu_to_be32(findex + 1);
+ /*
+ * Tag new entry so nused will go up.
+ */
+ free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+ }
+ /*
+ * If this entry was for an empty data block
+ * (this should always be true) then update the header.
+ */
+ if (be16_to_cpu(free->bests[findex]) == NULLDATAOFF) {
+ be32_add_cpu(&free->hdr.nused, 1);
+ xfs_dir2_free_log_header(tp, fbp);
+ }
+ /*
+ * Update the real value in the table.
+ * We haven't allocated the data entry yet so this will
+ * change again.
+ */
+ data = dbp->data;
+ free->bests[findex] = data->hdr.bestfree[0].length;
+ logfree = 1;
+ }
+ /*
+ * We had a data block so we don't have to make a new one.
+ */
+ else {
+ /*
+ * If just checking, we succeeded.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
+ xfs_da_buf_done(fbp);
+ return 0;
+ }
+ /*
+ * Read the data block in.
+ */
+ if (unlikely(
+ error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+ -1, &dbp, XFS_DATA_FORK))) {
+ if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
+ xfs_da_buf_done(fbp);
+ return error;
+ }
+ data = dbp->data;
+ logfree = 0;
+ }
+ ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) >= length);
+ /*
+ * Point to the existing unused space.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset));
+ needscan = needlog = 0;
+ /*
+ * Mark the first part of the unused space, inuse for us.
+ */
+ xfs_dir2_data_use_free(tp, dbp, dup,
+ (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+ &needlog, &needscan);
+ /*
+ * Fill in the new entry and log it.
+ */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ tagp = xfs_dir2_data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)data);
+ xfs_dir2_data_log_entry(tp, dbp, dep);
+ /*
+ * Rescan the block for bestfree if needed.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(mp, data, &needlog);
+ /*
+ * Log the data block header if needed.
+ */
+ if (needlog)
+ xfs_dir2_data_log_header(tp, dbp);
+ /*
+ * If the freespace entry is now wrong, update it.
+ */
+ if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(data->hdr.bestfree[0].length)) {
+ free->bests[findex] = data->hdr.bestfree[0].length;
+ logfree = 1;
+ }
+ /*
+ * Log the freespace entry if needed.
+ */
+ if (logfree)
+ xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ /*
+ * If the caller didn't hand us the freespace block, drop it.
+ */
+ if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
+ xfs_da_buf_done(fbp);
+ /*
+ * Return the data block and offset in args, then drop the data block.
+ */
+ args->blkno = (xfs_dablk_t)dbno;
+ args->index = be16_to_cpu(*tagp);
+ xfs_da_buf_done(dbp);
+ return 0;
+}
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int /* error */
+xfs_dir2_node_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ int error; /* error return value */
+ int i; /* btree level */
+ int rval; /* operation return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_lookup(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ state->blocksize = state->mp->m_dirblksize;
+ state->node_ents = state->mp->m_dir_node_ents;
+ /*
+ * Fill in the path to the entry in the cursor.
+ */
+ error = xfs_da_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
+ /* If a CI match, dup the actual name and return EEXIST */
+ xfs_dir2_data_entry_t *dep;
+
+ dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp->
+ data + state->extrablk.index);
+ rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ }
+ /*
+ * Release the btree blocks and leaf block.
+ */
+ for (i = 0; i < state->path.active; i++) {
+ xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+ /*
+ * Release the data block if we have it.
+ */
+ if (state->extravalid && state->extrablk.bp) {
+ xfs_da_brelse(args->trans, state->extrablk.bp);
+ state->extrablk.bp = NULL;
+ }
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int /* error */
+xfs_dir2_node_removename(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block */
+ int error; /* error return value */
+ int rval; /* operation return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_removename(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ state->blocksize = state->mp->m_dirblksize;
+ state->node_ents = state->mp->m_dir_node_ents;
+ /*
+ * Look up the entry we're deleting, set up the cursor.
+ */
+ error = xfs_da_node_lookup_int(state, &rval);
+ if (error)
+ rval = error;
+ /*
+ * Didn't find it, upper layer screwed up.
+ */
+ if (rval != EEXIST) {
+ xfs_da_state_free(state);
+ return rval;
+ }
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ ASSERT(state->extravalid);
+ /*
+ * Remove the leaf and data entries.
+ * Extrablk refers to the data block.
+ */
+ error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+ &state->extrablk, &rval);
+ if (error)
+ return error;
+ /*
+ * Fix the hash values up the btree.
+ */
+ xfs_da_fixhashpath(state, &state->path);
+ /*
+ * If we need to join leaf blocks, do it.
+ */
+ if (rval && state->path.active > 1)
+ error = xfs_da_join(state);
+ /*
+ * If no errors so far, try conversion to leaf format.
+ */
+ if (!error)
+ error = xfs_dir2_node_to_leaf(state);
+ xfs_da_state_free(state);
+ return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int /* error */
+xfs_dir2_node_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_da_state_blk_t *blk; /* leaf block */
+ xfs_dir2_data_t *data; /* data block structure */
+ xfs_dir2_data_entry_t *dep; /* data entry changed */
+ int error; /* error return value */
+ int i; /* btree level */
+ xfs_ino_t inum; /* new inode number */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
+ int rval; /* internal return value */
+ xfs_da_state_t *state; /* btree cursor */
+
+ trace_xfs_dir2_node_replace(args);
+
+ /*
+ * Allocate and initialize the btree cursor.
+ */
+ state = xfs_da_state_alloc();
+ state->args = args;
+ state->mp = args->dp->i_mount;
+ state->blocksize = state->mp->m_dirblksize;
+ state->node_ents = state->mp->m_dir_node_ents;
+ inum = args->inumber;
+ /*
+ * Lookup the entry to change in the btree.
+ */
+ error = xfs_da_node_lookup_int(state, &rval);
+ if (error) {
+ rval = error;
+ }
+ /*
+ * It should be found, since the vnodeops layer has looked it up
+ * and locked it. But paranoia is good.
+ */
+ if (rval == EEXIST) {
+ /*
+ * Find the leaf entry.
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+ leaf = blk->bp->data;
+ lep = &leaf->ents[blk->index];
+ ASSERT(state->extravalid);
+ /*
+ * Point to the data entry.
+ */
+ data = state->extrablk.bp->data;
+ ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC);
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)data +
+ xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address)));
+ ASSERT(inum != be64_to_cpu(dep->inumber));
+ /*
+ * Fill in the new inode number and log the entry.
+ */
+ dep->inumber = cpu_to_be64(inum);
+ xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
+ rval = 0;
+ }
+ /*
+ * Didn't find it, and we're holding a data block. Drop it.
+ */
+ else if (state->extravalid) {
+ xfs_da_brelse(args->trans, state->extrablk.bp);
+ state->extrablk.bp = NULL;
+ }
+ /*
+ * Release all the buffers in the cursor.
+ */
+ for (i = 0; i < state->path.active; i++) {
+ xfs_da_brelse(args->trans, state->path.blk[i].bp);
+ state->path.blk[i].bp = NULL;
+ }
+ xfs_da_state_free(state);
+ return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int /* error */
+xfs_dir2_node_trim_free(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_fileoff_t fo, /* free block number */
+ int *rvalp) /* out: did something */
+{
+ xfs_dabuf_t *bp; /* freespace buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return code */
+ xfs_dir2_free_t *free; /* freespace structure */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_trans_t *tp; /* transaction pointer */
+
+ dp = args->dp;
+ mp = dp->i_mount;
+ tp = args->trans;
+ /*
+ * Read the freespace block.
+ */
+ if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
+ XFS_DATA_FORK))) {
+ return error;
+ }
+
+ /*
+ * There can be holes in freespace. If fo is a hole, there's
+ * nothing to do.
+ */
+ if (bp == NULL) {
+ return 0;
+ }
+ free = bp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
+ /*
+ * If there are used entries, there's nothing to do.
+ */
+ if (be32_to_cpu(free->hdr.nused) > 0) {
+ xfs_da_brelse(tp, bp);
+ *rvalp = 0;
+ return 0;
+ }
+ /*
+ * Blow the block away.
+ */
+ if ((error =
+ xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo),
+ bp))) {
+ /*
+ * Can't fail with ENOSPC since that only happens with no
+ * space reservation, when breaking up an extent into two
+ * pieces. This is the last block of an extent.
+ */
+ ASSERT(error != ENOSPC);
+ xfs_da_brelse(tp, bp);
+ return error;
+ }
+ /*
+ * Return that we succeeded.
+ */
+ *rvalp = 1;
+ return 0;
+}
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
new file mode 100644
index 00000000..82dfe714
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DIR2_NODE_H__
+#define __XFS_DIR2_NODE_H__
+
+/*
+ * Directory version 2, btree node format structures
+ */
+
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE 2
+#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_FREE_FIRSTDB(mp) \
+ xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET)
+
+#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */
+
+typedef struct xfs_dir2_free_hdr {
+ __be32 magic; /* XFS_DIR2_FREE_MAGIC */
+ __be32 firstdb; /* db of first entry */
+ __be32 nvalid; /* count of valid entries */
+ __be32 nused; /* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+ xfs_dir2_free_hdr_t hdr; /* block header */
+ __be16 bests[1]; /* best free counts */
+ /* unused entries are -1 */
+} xfs_dir2_free_t;
+
+#define XFS_DIR2_MAX_FREE_BESTS(mp) \
+ (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \
+ (uint)sizeof(xfs_dir2_data_off_t))
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+ return (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static inline int
+xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
+{
+ return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
+}
+
+extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
+ struct xfs_dabuf *lbp);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
+extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp,
+ struct xfs_da_args *args, int *indexp,
+ struct xfs_da_state *state);
+extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp,
+ struct xfs_dabuf *leaf2_bp);
+extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
+ struct xfs_da_state_blk *oldblk,
+ struct xfs_da_state_blk *newblk);
+extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+ struct xfs_da_state_blk *drop_blk,
+ struct xfs_da_state_blk *save_blk);
+extern int xfs_dir2_node_addname(struct xfs_da_args *args);
+extern int xfs_dir2_node_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_node_removename(struct xfs_da_args *args);
+extern int xfs_dir2_node_replace(struct xfs_da_args *args);
+extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+ int *rvalp);
+
+#endif /* __XFS_DIR2_NODE_H__ */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
new file mode 100644
index 00000000..b1bae6b1
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -0,0 +1,1267 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
+#include "xfs_trace.h"
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+ xfs_dir2_sf_entry_t *sfep,
+ xfs_dir2_data_aoff_t offset,
+ int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+ int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+ xfs_dir2_sf_entry_t **sfepp,
+ xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+#if XFS_BIG_INUMS
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+#endif /* XFS_BIG_INUMS */
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit it the
+ * space currently present in the inode. If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int /* size for sf form */
+xfs_dir2_block_sfsize(
+ xfs_inode_t *dp, /* incore inode pointer */
+ xfs_dir2_block_t *block, /* block directory data */
+ xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */
+{
+ xfs_dir2_dataptr_t addr; /* data entry address */
+ xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */
+ xfs_dir2_block_tail_t *btp; /* tail area of the block */
+ int count; /* shortform entry count */
+ xfs_dir2_data_entry_t *dep; /* data entry in the block */
+ int i; /* block entry index */
+ int i8count; /* count of big-inode entries */
+ int isdot; /* entry is "." */
+ int isdotdot; /* entry is ".." */
+ xfs_mount_t *mp; /* mount structure pointer */
+ int namelen; /* total name bytes */
+ xfs_ino_t parent = 0; /* parent inode number */
+ int size=0; /* total computed size */
+
+ mp = dp->i_mount;
+
+ count = i8count = namelen = 0;
+ btp = xfs_dir2_block_tail_p(mp, block);
+ blp = xfs_dir2_block_leaf_p(btp);
+
+ /*
+ * Iterate over the block's data entries by using the leaf pointers.
+ */
+ for (i = 0; i < be32_to_cpu(btp->count); i++) {
+ if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Calculate the pointer to the entry at hand.
+ */
+ dep = (xfs_dir2_data_entry_t *)
+ ((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
+ /*
+ * Detect . and .., so we can special-case them.
+ * . is not included in sf directories.
+ * .. is included by just the parent inode number.
+ */
+ isdot = dep->namelen == 1 && dep->name[0] == '.';
+ isdotdot =
+ dep->namelen == 2 &&
+ dep->name[0] == '.' && dep->name[1] == '.';
+#if XFS_BIG_INUMS
+ if (!isdot)
+ i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
+#endif
+ if (!isdot && !isdotdot) {
+ count++;
+ namelen += dep->namelen;
+ } else if (isdotdot)
+ parent = be64_to_cpu(dep->inumber);
+ /*
+ * Calculate the new size, see if we should give up yet.
+ */
+ size = xfs_dir2_sf_hdr_size(i8count) + /* header */
+ count + /* namelen */
+ count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+ namelen + /* name */
+ (i8count ? /* inumber */
+ (uint)sizeof(xfs_dir2_ino8_t) * count :
+ (uint)sizeof(xfs_dir2_ino4_t) * count);
+ if (size > XFS_IFORK_DSIZE(dp))
+ return size; /* size value is a failure */
+ }
+ /*
+ * Create the output header, if it worked.
+ */
+ sfhp->count = count;
+ sfhp->i8count = i8count;
+ xfs_dir2_sf_put_inumber((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent);
+ return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int /* error */
+xfs_dir2_block_to_sf(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dabuf_t *bp, /* block buffer */
+ int size, /* shortform directory size */
+ xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
+{
+ xfs_dir2_block_t *block; /* block structure */
+ xfs_dir2_block_tail_t *btp; /* block tail pointer */
+ xfs_dir2_data_entry_t *dep; /* data entry pointer */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_data_unused_t *dup; /* unused data pointer */
+ char *endptr; /* end of data entries */
+ int error; /* error return value */
+ int logflags; /* inode logging flags */
+ xfs_mount_t *mp; /* filesystem mount point */
+ char *ptr; /* current data pointer */
+ xfs_dir2_sf_entry_t *sfep; /* shortform entry */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_ino_t temp;
+
+ trace_xfs_dir2_block_to_sf(args);
+
+ dp = args->dp;
+ mp = dp->i_mount;
+
+ /*
+ * Make a copy of the block data, so we can shrink the inode
+ * and add local data.
+ */
+ block = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
+ memcpy(block, bp->data, mp->m_dirblksize);
+ logflags = XFS_ILOG_CORE;
+ if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
+ ASSERT(error != ENOSPC);
+ goto out;
+ }
+ /*
+ * The buffer is now unconditionally gone, whether
+ * xfs_dir2_shrink_inode worked or not.
+ *
+ * Convert the inode to local format.
+ */
+ dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+ dp->i_df.if_flags |= XFS_IFINLINE;
+ dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ ASSERT(dp->i_df.if_bytes == 0);
+ xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+ logflags |= XFS_ILOG_DDATA;
+ /*
+ * Copy the header into the newly allocate local space.
+ */
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
+ dp->i_d.di_size = size;
+ /*
+ * Set up to loop over the block's entries.
+ */
+ btp = xfs_dir2_block_tail_p(mp, block);
+ ptr = (char *)block->u;
+ endptr = (char *)xfs_dir2_block_leaf_p(btp);
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ /*
+ * Loop over the active and unused entries.
+ * Stop when we reach the leaf/tail portion of the block.
+ */
+ while (ptr < endptr) {
+ /*
+ * If it's unused, just skip over it.
+ */
+ dup = (xfs_dir2_data_unused_t *)ptr;
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ ptr += be16_to_cpu(dup->length);
+ continue;
+ }
+ dep = (xfs_dir2_data_entry_t *)ptr;
+ /*
+ * Skip .
+ */
+ if (dep->namelen == 1 && dep->name[0] == '.')
+ ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
+ /*
+ * Skip .., but make sure the inode number is right.
+ */
+ else if (dep->namelen == 2 &&
+ dep->name[0] == '.' && dep->name[1] == '.')
+ ASSERT(be64_to_cpu(dep->inumber) ==
+ xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent));
+ /*
+ * Normal entry, copy it into shortform.
+ */
+ else {
+ sfep->namelen = dep->namelen;
+ xfs_dir2_sf_put_offset(sfep,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)block));
+ memcpy(sfep->name, dep->name, dep->namelen);
+ temp = be64_to_cpu(dep->inumber);
+ xfs_dir2_sf_put_inumber(sfp, &temp,
+ xfs_dir2_sf_inumberp(sfep));
+ sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ }
+ ptr += xfs_dir2_data_entsize(dep->namelen);
+ }
+ ASSERT((char *)sfep - (char *)sfp == size);
+ xfs_dir2_sf_check(args);
+out:
+ xfs_trans_log_inode(args->trans, dp, logflags);
+ kmem_free(block);
+ return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard" which we decide on
+ * before changing anything.
+ * Convert to block form if necessary, if the new entry won't fit.
+ */
+int /* error */
+xfs_dir2_sf_addname(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ int add_entsize; /* size of the new entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int incr_isize; /* total change in size */
+ int new_isize; /* di_size after adding name */
+ int objchange; /* changing to 8-byte inodes */
+ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
+ int old_isize; /* di_size before adding name */
+ int pick; /* which algorithm to use */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
+
+ trace_xfs_dir2_sf_addname(args);
+
+ ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
+ dp = args->dp;
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ /*
+ * Make sure the shortform value has some of its header.
+ */
+ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+ return XFS_ERROR(EIO);
+ }
+ ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
+ /*
+ * Compute entry (and change in) size.
+ */
+ add_entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen);
+ incr_isize = add_entsize;
+ objchange = 0;
+#if XFS_BIG_INUMS
+ /*
+ * Do we have to change to 8 byte inodes?
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+ /*
+ * Yes, adjust the entry size and the total size.
+ */
+ add_entsize +=
+ (uint)sizeof(xfs_dir2_ino8_t) -
+ (uint)sizeof(xfs_dir2_ino4_t);
+ incr_isize +=
+ (sfp->hdr.count + 2) *
+ ((uint)sizeof(xfs_dir2_ino8_t) -
+ (uint)sizeof(xfs_dir2_ino4_t));
+ objchange = 1;
+ }
+#endif
+ old_isize = (int)dp->i_d.di_size;
+ new_isize = old_isize + incr_isize;
+ /*
+ * Won't fit as shortform any more (due to size),
+ * or the pick routine says it won't (due to offset values).
+ */
+ if (new_isize > XFS_IFORK_DSIZE(dp) ||
+ (pick =
+ xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+ /*
+ * Just checking or no space reservation, it doesn't fit.
+ */
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+ return XFS_ERROR(ENOSPC);
+ /*
+ * Convert to block form then add the name.
+ */
+ error = xfs_dir2_sf_to_block(args);
+ if (error)
+ return error;
+ return xfs_dir2_block_addname(args);
+ }
+ /*
+ * Just checking, it fits.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ return 0;
+ /*
+ * Do it the easy way - just add it at the end.
+ */
+ if (pick == 1)
+ xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+ /*
+ * Do it the hard way - look for a place to insert the new entry.
+ * Convert to 8 byte inode numbers first if necessary.
+ */
+ else {
+ ASSERT(pick == 2);
+#if XFS_BIG_INUMS
+ if (objchange)
+ xfs_dir2_sf_toino8(args);
+#endif
+ xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+ }
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */
+ xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
+ int new_isize) /* new directory size */
+{
+ int byteoff; /* byte offset in sf dir */
+ xfs_inode_t *dp; /* incore directory inode */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+
+ dp = args->dp;
+
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ byteoff = (int)((char *)sfep - (char *)sfp);
+ /*
+ * Grow the in-inode space.
+ */
+ xfs_idata_realloc(dp, xfs_dir2_sf_entsize_byname(sfp, args->namelen),
+ XFS_DATA_FORK);
+ /*
+ * Need to set up again due to realloc of the inode data.
+ */
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+ /*
+ * Fill in the new entry.
+ */
+ sfep->namelen = args->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, args->name, sfep->namelen);
+ xfs_dir2_sf_put_inumber(sfp, &args->inumber,
+ xfs_dir2_sf_inumberp(sfep));
+ /*
+ * Update the header and inode.
+ */
+ sfp->hdr.count++;
+#if XFS_BIG_INUMS
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+ sfp->hdr.i8count++;
+#endif
+ dp->i_d.di_size = new_isize;
+ xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+ xfs_da_args_t *args, /* operation arguments */
+ int objchange, /* changing inode number size */
+ int new_isize) /* new directory size */
+{
+ int add_datasize; /* data size need for new ent */
+ char *buf; /* buffer for old */
+ xfs_inode_t *dp; /* incore directory inode */
+ int eof; /* reached end of old dir */
+ int nbytes; /* temp for byte copies */
+ xfs_dir2_data_aoff_t new_offset; /* next offset value */
+ xfs_dir2_data_aoff_t offset; /* current offset value */
+ int old_isize; /* previous di_size */
+ xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
+ xfs_dir2_sf_t *oldsfp; /* original shortform dir */
+ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
+ xfs_dir2_sf_t *sfp; /* new shortform dir */
+
+ /*
+ * Copy the old directory to the stack buffer.
+ */
+ dp = args->dp;
+
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ old_isize = (int)dp->i_d.di_size;
+ buf = kmem_alloc(old_isize, KM_SLEEP);
+ oldsfp = (xfs_dir2_sf_t *)buf;
+ memcpy(oldsfp, sfp, old_isize);
+ /*
+ * Loop over the old directory finding the place we're going
+ * to insert the new entry.
+ * If it's going to end up at the end then oldsfep will point there.
+ */
+ for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp),
+ add_datasize = xfs_dir2_data_entsize(args->namelen),
+ eof = (char *)oldsfep == &buf[old_isize];
+ !eof;
+ offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen),
+ oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep),
+ eof = (char *)oldsfep == &buf[old_isize]) {
+ new_offset = xfs_dir2_sf_get_offset(oldsfep);
+ if (offset + add_datasize <= new_offset)
+ break;
+ }
+ /*
+ * Get rid of the old directory, then allocate space for
+ * the new one. We do this so xfs_idata_realloc won't copy
+ * the data.
+ */
+ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+ /*
+ * Reset the pointer since the buffer was reallocated.
+ */
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Copy the first part of the directory, including the header.
+ */
+ nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+ memcpy(sfp, oldsfp, nbytes);
+ sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+ /*
+ * Fill in the new entry, and update the header counts.
+ */
+ sfep->namelen = args->namelen;
+ xfs_dir2_sf_put_offset(sfep, offset);
+ memcpy(sfep->name, args->name, sfep->namelen);
+ xfs_dir2_sf_put_inumber(sfp, &args->inumber,
+ xfs_dir2_sf_inumberp(sfep));
+ sfp->hdr.count++;
+#if XFS_BIG_INUMS
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+ sfp->hdr.i8count++;
+#endif
+ /*
+ * If there's more left to copy, do that.
+ */
+ if (!eof) {
+ sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ memcpy(sfep, oldsfep, old_isize - nbytes);
+ }
+ kmem_free(buf);
+ dp->i_d.di_size = new_isize;
+ xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/*ARGSUSED*/
+static int /* pick result */
+xfs_dir2_sf_addname_pick(
+ xfs_da_args_t *args, /* operation arguments */
+ int objchange, /* inode # size changes */
+ xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
+ xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int holefit; /* found hole it will fit in */
+ int i; /* entry number */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_data_aoff_t offset; /* data block offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform entry */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+ int size; /* entry's data size */
+ int used; /* data bytes used */
+
+ dp = args->dp;
+ mp = dp->i_mount;
+
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ size = xfs_dir2_data_entsize(args->namelen);
+ offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ holefit = 0;
+ /*
+ * Loop over sf entries.
+ * Keep track of data offset and whether we've seen a place
+ * to insert the new entry.
+ */
+ for (i = 0; i < sfp->hdr.count; i++) {
+ if (!holefit)
+ holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
+ offset = xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(sfep->namelen);
+ sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ }
+ /*
+ * Calculate data bytes used excluding the new entry, if this
+ * was a data block (block form directory).
+ */
+ used = offset +
+ (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t);
+ /*
+ * If it won't fit in a block form then we can't insert it,
+ * we'll go back, convert to block, then try the insert and convert
+ * to leaf.
+ */
+ if (used + (holefit ? 0 : size) > mp->m_dirblksize)
+ return 0;
+ /*
+ * If changing the inode number size, do it the hard way.
+ */
+#if XFS_BIG_INUMS
+ if (objchange) {
+ return 2;
+ }
+#else
+ ASSERT(objchange == 0);
+#endif
+ /*
+ * If it won't fit at the end then do it the hard way (use the hole).
+ */
+ if (used + size > mp->m_dirblksize)
+ return 2;
+ /*
+ * Do it the easy way.
+ */
+ *sfepp = sfep;
+ *offsetp = offset;
+ return 1;
+}
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int i; /* entry number */
+ int i8count; /* number of big inode#s */
+ xfs_ino_t ino; /* entry inode number */
+ int offset; /* data offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+
+ dp = args->dp;
+
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ offset = XFS_DIR2_DATA_FIRST_OFFSET;
+ ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
+ i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+ i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
+ ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
+ i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+ offset =
+ xfs_dir2_sf_get_offset(sfep) +
+ xfs_dir2_data_entsize(sfep->namelen);
+ }
+ ASSERT(i8count == sfp->hdr.i8count);
+ ASSERT(XFS_BIG_INUMS || i8count == 0);
+ ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
+ ASSERT(offset +
+ (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+ (uint)sizeof(xfs_dir2_block_tail_t) <=
+ dp->i_mount->m_dirblksize);
+}
+#endif /* DEBUG */
+
+/*
+ * Create a new (shortform) directory.
+ */
+int /* error, always 0 */
+xfs_dir2_sf_create(
+ xfs_da_args_t *args, /* operation arguments */
+ xfs_ino_t pino) /* parent inode number */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int i8count; /* parent inode is an 8-byte number */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+ int size; /* directory size */
+
+ trace_xfs_dir2_sf_create(args);
+
+ dp = args->dp;
+
+ ASSERT(dp != NULL);
+ ASSERT(dp->i_d.di_size == 0);
+ /*
+ * If it's currently a zero-length extent file,
+ * convert it to local format.
+ */
+ if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+ dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
+ dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ dp->i_df.if_flags |= XFS_IFINLINE;
+ }
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ ASSERT(dp->i_df.if_bytes == 0);
+ i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+ size = xfs_dir2_sf_hdr_size(i8count);
+ /*
+ * Make a buffer for the data.
+ */
+ xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+ /*
+ * Fill in the header,
+ */
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ sfp->hdr.i8count = i8count;
+ /*
+ * Now can put in the inode number, since i8count is set.
+ */
+ xfs_dir2_sf_put_inumber(sfp, &pino, &sfp->hdr.parent);
+ sfp->hdr.count = 0;
+ dp->i_d.di_size = size;
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+int /* error */
+xfs_dir2_sf_getdents(
+ xfs_inode_t *dp, /* incore directory inode */
+ void *dirent,
+ xfs_off_t *offset,
+ filldir_t filldir)
+{
+ int i; /* shortform entry number */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_dataptr_t off; /* current entry's offset */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+ xfs_dir2_dataptr_t dot_offset;
+ xfs_dir2_dataptr_t dotdot_offset;
+ xfs_ino_t ino;
+
+ mp = dp->i_mount;
+
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ /*
+ * Give up if the directory is way too short.
+ */
+ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ return XFS_ERROR(EIO);
+ }
+
+ ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
+
+ /*
+ * If the block number in the offset is out of range, we're done.
+ */
+ if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+ return 0;
+
+ /*
+ * Precalculate offsets for . and .. as we will always need them.
+ *
+ * XXX(hch): the second argument is sometimes 0 and sometimes
+ * mp->m_dirdatablk.
+ */
+ dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ XFS_DIR2_DATA_DOT_OFFSET);
+ dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ XFS_DIR2_DATA_DOTDOT_OFFSET);
+
+ /*
+ * Put . entry unless we're starting past it.
+ */
+ if (*offset <= dot_offset) {
+ if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
+ *offset = dot_offset & 0x7fffffff;
+ return 0;
+ }
+ }
+
+ /*
+ * Put .. entry unless we're starting past it.
+ */
+ if (*offset <= dotdot_offset) {
+ ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
+ if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
+ *offset = dotdot_offset & 0x7fffffff;
+ return 0;
+ }
+ }
+
+ /*
+ * Loop while there are more entries and put'ing works.
+ */
+ sfep = xfs_dir2_sf_firstentry(sfp);
+ for (i = 0; i < sfp->hdr.count; i++) {
+ off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ xfs_dir2_sf_get_offset(sfep));
+
+ if (*offset > off) {
+ sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ continue;
+ }
+
+ ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
+ if (filldir(dirent, (char *)sfep->name, sfep->namelen,
+ off & 0x7fffffff, ino, DT_UNKNOWN)) {
+ *offset = off & 0x7fffffff;
+ return 0;
+ }
+ sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+ }
+
+ *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+ 0x7fffffff;
+ return 0;
+}
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns EEXIST if found, ENOENT if not found.
+ */
+int /* error */
+xfs_dir2_sf_lookup(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int i; /* entry index */
+ int error;
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+ enum xfs_dacmp cmp; /* comparison result */
+ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
+
+ trace_xfs_dir2_sf_lookup(args);
+
+ xfs_dir2_sf_check(args);
+ dp = args->dp;
+
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ /*
+ * Bail out if the directory is way too short.
+ */
+ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+ return XFS_ERROR(EIO);
+ }
+ ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
+ /*
+ * Special case for .
+ */
+ if (args->namelen == 1 && args->name[0] == '.') {
+ args->inumber = dp->i_ino;
+ args->cmpresult = XFS_CMP_EXACT;
+ return XFS_ERROR(EEXIST);
+ }
+ /*
+ * Special case for ..
+ */
+ if (args->namelen == 2 &&
+ args->name[0] == '.' && args->name[1] == '.') {
+ args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
+ args->cmpresult = XFS_CMP_EXACT;
+ return XFS_ERROR(EEXIST);
+ }
+ /*
+ * Loop over all the entries trying to match ours.
+ */
+ ci_sfep = NULL;
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ /*
+ * Compare name and if it's an exact match, return the inode
+ * number. If it's the first case-insensitive match, store the
+ * inode number and continue looking for an exact match.
+ */
+ cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+ sfep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ args->inumber = xfs_dir2_sf_get_inumber(sfp,
+ xfs_dir2_sf_inumberp(sfep));
+ if (cmp == XFS_CMP_EXACT)
+ return XFS_ERROR(EEXIST);
+ ci_sfep = sfep;
+ }
+ }
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was not found, return ENOENT.
+ */
+ if (!ci_sfep)
+ return XFS_ERROR(ENOENT);
+ /* otherwise process the CI match as required by the caller */
+ error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+ return XFS_ERROR(error);
+}
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int /* error */
+xfs_dir2_sf_removename(
+ xfs_da_args_t *args)
+{
+ int byteoff; /* offset of removed entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int entsize; /* this entry's size */
+ int i; /* shortform entry index */
+ int newsize; /* new inode size */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_removename(args);
+
+ dp = args->dp;
+
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ oldsize = (int)dp->i_d.di_size;
+ /*
+ * Bail out if the directory is way too short.
+ */
+ if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+ return XFS_ERROR(EIO);
+ }
+ ASSERT(dp->i_df.if_bytes == oldsize);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
+ /*
+ * Loop over the old directory entries.
+ * Find the one we're deleting.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
+ ASSERT(xfs_dir2_sf_get_inumber(sfp,
+ xfs_dir2_sf_inumberp(sfep)) ==
+ args->inumber);
+ break;
+ }
+ }
+ /*
+ * Didn't find it.
+ */
+ if (i == sfp->hdr.count)
+ return XFS_ERROR(ENOENT);
+ /*
+ * Calculate sizes.
+ */
+ byteoff = (int)((char *)sfep - (char *)sfp);
+ entsize = xfs_dir2_sf_entsize_byname(sfp, args->namelen);
+ newsize = oldsize - entsize;
+ /*
+ * Copy the part if any after the removed entry, sliding it down.
+ */
+ if (byteoff + entsize < oldsize)
+ memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
+ oldsize - (byteoff + entsize));
+ /*
+ * Fix up the header and file size.
+ */
+ sfp->hdr.count--;
+ dp->i_d.di_size = newsize;
+ /*
+ * Reallocate, making it smaller.
+ */
+ xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+#if XFS_BIG_INUMS
+ /*
+ * Are we changing inode number size?
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+ if (sfp->hdr.i8count == 1)
+ xfs_dir2_sf_toino4(args);
+ else
+ sfp->hdr.i8count--;
+ }
+#endif
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ return 0;
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int /* error */
+xfs_dir2_sf_replace(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ xfs_inode_t *dp; /* incore directory inode */
+ int i; /* entry index */
+#if XFS_BIG_INUMS || defined(DEBUG)
+ xfs_ino_t ino=0; /* entry old inode number */
+#endif
+#if XFS_BIG_INUMS
+ int i8elevated; /* sf_toino8 set i8count=1 */
+#endif
+ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
+ xfs_dir2_sf_t *sfp; /* shortform structure */
+
+ trace_xfs_dir2_sf_replace(args);
+
+ dp = args->dp;
+
+ ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+ /*
+ * Bail out if the shortform directory is way too small.
+ */
+ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+ return XFS_ERROR(EIO);
+ }
+ ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+ ASSERT(dp->i_df.if_u1.if_data != NULL);
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->hdr.i8count));
+#if XFS_BIG_INUMS
+ /*
+ * New inode number is large, and need to convert to 8-byte inodes.
+ */
+ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+ int error; /* error return value */
+ int newsize; /* new inode size */
+
+ newsize =
+ dp->i_df.if_bytes +
+ (sfp->hdr.count + 1) *
+ ((uint)sizeof(xfs_dir2_ino8_t) -
+ (uint)sizeof(xfs_dir2_ino4_t));
+ /*
+ * Won't fit as shortform, convert to block then do replace.
+ */
+ if (newsize > XFS_IFORK_DSIZE(dp)) {
+ error = xfs_dir2_sf_to_block(args);
+ if (error) {
+ return error;
+ }
+ return xfs_dir2_block_replace(args);
+ }
+ /*
+ * Still fits, convert to 8-byte now.
+ */
+ xfs_dir2_sf_toino8(args);
+ i8elevated = 1;
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ } else
+ i8elevated = 0;
+#endif
+ ASSERT(args->namelen != 1 || args->name[0] != '.');
+ /*
+ * Replace ..'s entry.
+ */
+ if (args->namelen == 2 &&
+ args->name[0] == '.' && args->name[1] == '.') {
+#if XFS_BIG_INUMS || defined(DEBUG)
+ ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
+ ASSERT(args->inumber != ino);
+#endif
+ xfs_dir2_sf_put_inumber(sfp, &args->inumber, &sfp->hdr.parent);
+ }
+ /*
+ * Normal entry, look for the name.
+ */
+ else {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
+ i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
+#if XFS_BIG_INUMS || defined(DEBUG)
+ ino = xfs_dir2_sf_get_inumber(sfp,
+ xfs_dir2_sf_inumberp(sfep));
+ ASSERT(args->inumber != ino);
+#endif
+ xfs_dir2_sf_put_inumber(sfp, &args->inumber,
+ xfs_dir2_sf_inumberp(sfep));
+ break;
+ }
+ }
+ /*
+ * Didn't find it.
+ */
+ if (i == sfp->hdr.count) {
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+#if XFS_BIG_INUMS
+ if (i8elevated)
+ xfs_dir2_sf_toino4(args);
+#endif
+ return XFS_ERROR(ENOENT);
+ }
+ }
+#if XFS_BIG_INUMS
+ /*
+ * See if the old number was large, the new number is small.
+ */
+ if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+ args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+ /*
+ * And the old count was one, so need to convert to small.
+ */
+ if (sfp->hdr.i8count == 1)
+ xfs_dir2_sf_toino4(args);
+ else
+ sfp->hdr.i8count--;
+ }
+ /*
+ * See if the old number was small, the new number is large.
+ */
+ if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+ args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+ /*
+ * add to the i8count unless we just converted to 8-byte
+ * inodes (which does an implied i8count = 1)
+ */
+ ASSERT(sfp->hdr.i8count != 0);
+ if (!i8elevated)
+ sfp->hdr.i8count++;
+ }
+#endif
+ xfs_dir2_sf_check(args);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+ return 0;
+}
+
+#if XFS_BIG_INUMS
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ char *buf; /* old dir's buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int i; /* entry index */
+ xfs_ino_t ino; /* entry inode number */
+ int newsize; /* new inode size */
+ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
+ xfs_dir2_sf_t *oldsfp; /* old sf directory */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* new sf entry */
+ xfs_dir2_sf_t *sfp; /* new sf directory */
+
+ trace_xfs_dir2_sf_toino4(args);
+
+ dp = args->dp;
+
+ /*
+ * Copy the old directory to the buffer.
+ * Then nuke it from the inode, and add the new buffer to the inode.
+ * Don't want xfs_idata_realloc copying the data here.
+ */
+ oldsize = dp->i_df.if_bytes;
+ buf = kmem_alloc(oldsize, KM_SLEEP);
+ oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->hdr.i8count == 1);
+ memcpy(buf, oldsfp, oldsize);
+ /*
+ * Compute the new inode size.
+ */
+ newsize =
+ oldsize -
+ (oldsfp->hdr.count + 1) *
+ ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+ xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+ /*
+ * Reset our pointers, the data has moved.
+ */
+ oldsfp = (xfs_dir2_sf_t *)buf;
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Fill in the new header.
+ */
+ sfp->hdr.count = oldsfp->hdr.count;
+ sfp->hdr.i8count = 0;
+ ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent);
+ xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent);
+ /*
+ * Copy the entries field by field.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+ sfep->namelen = oldsfep->namelen;
+ sfep->offset = oldsfep->offset;
+ memcpy(sfep->name, oldsfep->name, sfep->namelen);
+ ino = xfs_dir2_sf_get_inumber(oldsfp,
+ xfs_dir2_sf_inumberp(oldsfep));
+ xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep));
+ }
+ /*
+ * Clean up the inode.
+ */
+ kmem_free(buf);
+ dp->i_d.di_size = newsize;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+/*
+ * Convert from 4-byte inode numbers to 8-byte inode numbers.
+ * The new 8-byte inode number is not there yet, we leave with the
+ * count 1 but no corresponding entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+ xfs_da_args_t *args) /* operation arguments */
+{
+ char *buf; /* old dir's buffer */
+ xfs_inode_t *dp; /* incore directory inode */
+ int i; /* entry index */
+ xfs_ino_t ino; /* entry inode number */
+ int newsize; /* new inode size */
+ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
+ xfs_dir2_sf_t *oldsfp; /* old sf directory */
+ int oldsize; /* old inode size */
+ xfs_dir2_sf_entry_t *sfep; /* new sf entry */
+ xfs_dir2_sf_t *sfp; /* new sf directory */
+
+ trace_xfs_dir2_sf_toino8(args);
+
+ dp = args->dp;
+
+ /*
+ * Copy the old directory to the buffer.
+ * Then nuke it from the inode, and add the new buffer to the inode.
+ * Don't want xfs_idata_realloc copying the data here.
+ */
+ oldsize = dp->i_df.if_bytes;
+ buf = kmem_alloc(oldsize, KM_SLEEP);
+ oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ ASSERT(oldsfp->hdr.i8count == 0);
+ memcpy(buf, oldsfp, oldsize);
+ /*
+ * Compute the new inode size.
+ */
+ newsize =
+ oldsize +
+ (oldsfp->hdr.count + 1) *
+ ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+ xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+ xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+ /*
+ * Reset our pointers, the data has moved.
+ */
+ oldsfp = (xfs_dir2_sf_t *)buf;
+ sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+ /*
+ * Fill in the new header.
+ */
+ sfp->hdr.count = oldsfp->hdr.count;
+ sfp->hdr.i8count = 1;
+ ino = xfs_dir2_sf_get_inumber(oldsfp, &oldsfp->hdr.parent);
+ xfs_dir2_sf_put_inumber(sfp, &ino, &sfp->hdr.parent);
+ /*
+ * Copy the entries field by field.
+ */
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
+ oldsfep = xfs_dir2_sf_firstentry(oldsfp);
+ i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+ sfep->namelen = oldsfep->namelen;
+ sfep->offset = oldsfep->offset;
+ memcpy(sfep->name, oldsfep->name, sfep->namelen);
+ ino = xfs_dir2_sf_get_inumber(oldsfp,
+ xfs_dir2_sf_inumberp(oldsfep));
+ xfs_dir2_sf_put_inumber(sfp, &ino, xfs_dir2_sf_inumberp(sfep));
+ }
+ /*
+ * Clean up the inode.
+ */
+ kmem_free(buf);
+ dp->i_d.di_size = newsize;
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+#endif /* XFS_BIG_INUMS */
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
new file mode 100644
index 00000000..6ac44b55
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_DIR2_SF_H__
+#define __XFS_DIR2_SF_H__
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to
+ * fit into the literal area of the inode.
+ */
+
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_block;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+
+typedef union {
+ xfs_dir2_ino8_t i8;
+ xfs_dir2_ino4_t i4;
+} xfs_dir2_inou_t;
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
+
+/*
+ * The parent directory has a dedicated field, and the self-pointer must
+ * be calculated on the fly.
+ *
+ * Entries are packed toward the top as tightly as possible. The header
+ * and the elements must be memcpy'd out into a work area to get correct
+ * alignment for the inode number fields.
+ */
+typedef struct xfs_dir2_sf_hdr {
+ __uint8_t count; /* count of entries */
+ __uint8_t i8count; /* count of 8-byte inode #s */
+ xfs_dir2_inou_t parent; /* parent dir inode number */
+} __arch_pack xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+ __uint8_t namelen; /* actual name length */
+ xfs_dir2_sf_off_t offset; /* saved offset */
+ __uint8_t name[1]; /* name, variable size */
+ xfs_dir2_inou_t inumber; /* inode number, var. offset */
+} __arch_pack xfs_dir2_sf_entry_t;
+
+typedef struct xfs_dir2_sf {
+ xfs_dir2_sf_hdr_t hdr; /* shortform header */
+ xfs_dir2_sf_entry_t list[1]; /* shortform entries */
+} xfs_dir2_sf_t;
+
+static inline int xfs_dir2_sf_hdr_size(int i8count)
+{
+ return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \
+ ((i8count) == 0) * \
+ ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
+}
+
+static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep)
+{
+ return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen];
+}
+
+static inline xfs_intino_t
+xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from)
+{
+ return ((sfp)->hdr.i8count == 0 ? \
+ (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \
+ (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8));
+}
+
+static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
+ xfs_dir2_inou_t *to)
+{
+ if ((sfp)->hdr.i8count == 0)
+ XFS_PUT_DIR_INO4(*(from), (to)->i4);
+ else
+ XFS_PUT_DIR_INO8(*(from), (to)->i8);
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
+{
+ return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i);
+}
+
+static inline void
+xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
+{
+ INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off);
+}
+
+static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
+{
+ return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \
+ ((sfp)->hdr.i8count == 0) * \
+ ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
+}
+
+static inline int
+xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
+{
+ return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \
+ ((sfp)->hdr.i8count == 0) * \
+ ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)));
+}
+
+static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp)
+{
+ return ((xfs_dir2_sf_entry_t *) \
+ ((char *)(sfp) + xfs_dir2_sf_hdr_size(sfp->hdr.i8count)));
+}
+
+static inline xfs_dir2_sf_entry_t *
+xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
+{
+ return ((xfs_dir2_sf_entry_t *) \
+ ((char *)(sfep) + xfs_dir2_sf_entsize_byentry(sfp,sfep)));
+}
+
+/*
+ * Functions.
+ */
+extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
+ struct xfs_dir2_block *block,
+ xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp,
+ int size, xfs_dir2_sf_hdr_t *sfhp);
+extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
+extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent,
+ xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
+extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
+extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
+
+#endif /* __XFS_DIR2_SF_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
new file mode 100644
index 00000000..39f06336
--- /dev/null
+++ b/fs/xfs/xfs_error.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_utils.h"
+#include "xfs_error.h"
+
+#ifdef DEBUG
+
+int xfs_etrap[XFS_ERROR_NTRAP] = {
+ 0,
+};
+
+int
+xfs_error_trap(int e)
+{
+ int i;
+
+ if (!e)
+ return 0;
+ for (i = 0; i < XFS_ERROR_NTRAP; i++) {
+ if (xfs_etrap[i] == 0)
+ break;
+ if (e != xfs_etrap[i])
+ continue;
+ xfs_notice(NULL, "%s: error %d", __func__, e);
+ BUG();
+ break;
+ }
+ return e;
+}
+
+int xfs_etest[XFS_NUM_INJECT_ERROR];
+int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
+char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+int xfs_error_test_active;
+
+int
+xfs_error_test(int error_tag, int *fsidp, char *expression,
+ int line, char *file, unsigned long randfactor)
+{
+ int i;
+ int64_t fsid;
+
+ if (random32() % randfactor)
+ return 0;
+
+ memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
+
+ for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+ if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
+ xfs_warn(NULL,
+ "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
+ expression, file, line, xfs_etest_fsname[i]);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+int
+xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+{
+ int i;
+ int len;
+ int64_t fsid;
+
+ memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+ if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
+ xfs_warn(mp, "error tag #%d on", error_tag);
+ return 0;
+ }
+ }
+
+ for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+ if (xfs_etest[i] == 0) {
+ xfs_warn(mp, "Turned on XFS error tag #%d",
+ error_tag);
+ xfs_etest[i] = error_tag;
+ xfs_etest_fsid[i] = fsid;
+ len = strlen(mp->m_fsname);
+ xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
+ strcpy(xfs_etest_fsname[i], mp->m_fsname);
+ xfs_error_test_active++;
+ return 0;
+ }
+ }
+
+ xfs_warn(mp, "error tag overflow, too many turned on");
+
+ return 1;
+}
+
+int
+xfs_errortag_clearall(xfs_mount_t *mp, int loud)
+{
+ int64_t fsid;
+ int cleared = 0;
+ int i;
+
+ memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
+
+
+ for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+ if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
+ xfs_etest[i] != 0) {
+ cleared = 1;
+ xfs_warn(mp, "Clearing XFS error tag #%d",
+ xfs_etest[i]);
+ xfs_etest[i] = 0;
+ xfs_etest_fsid[i] = 0LL;
+ kmem_free(xfs_etest_fsname[i]);
+ xfs_etest_fsname[i] = NULL;
+ xfs_error_test_active--;
+ }
+ }
+
+ if (loud || cleared)
+ xfs_warn(mp, "Cleared all XFS error tags for filesystem");
+
+ return 0;
+}
+#endif /* DEBUG */
+
+void
+xfs_error_report(
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ const char *filename,
+ int linenum,
+ inst_t *ra)
+{
+ if (level <= xfs_error_level) {
+ xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
+ "Internal error %s at line %d of file %s. Caller 0x%p\n",
+ tag, linenum, filename, ra);
+
+ xfs_stack_trace();
+ }
+}
+
+void
+xfs_corruption_error(
+ const char *tag,
+ int level,
+ struct xfs_mount *mp,
+ void *p,
+ const char *filename,
+ int linenum,
+ inst_t *ra)
+{
+ if (level <= xfs_error_level)
+ xfs_hex_dump(p, 16);
+ xfs_error_report(tag, level, mp, filename, linenum, ra);
+ xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
+}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
new file mode 100644
index 00000000..079a367f
--- /dev/null
+++ b/fs/xfs/xfs_error.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ERROR_H__
+#define __XFS_ERROR_H__
+
+#ifdef DEBUG
+#define XFS_ERROR_NTRAP 10
+extern int xfs_etrap[XFS_ERROR_NTRAP];
+extern int xfs_error_trap(int);
+#define XFS_ERROR(e) xfs_error_trap(e)
+#else
+#define XFS_ERROR(e) (e)
+#endif
+
+struct xfs_mount;
+
+extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+ const char *filename, int linenum, inst_t *ra);
+extern void xfs_corruption_error(const char *tag, int level,
+ struct xfs_mount *mp, void *p, const char *filename,
+ int linenum, inst_t *ra);
+
+#define XFS_ERROR_REPORT(e, lvl, mp) \
+ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
+#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \
+ xfs_corruption_error(e, lvl, mp, mem, \
+ __FILE__, __LINE__, __return_address)
+
+#define XFS_ERRLEVEL_OFF 0
+#define XFS_ERRLEVEL_LOW 1
+#define XFS_ERRLEVEL_HIGH 5
+
+/*
+ * Macros to set EFSCORRUPTED & return/branch.
+ */
+#define XFS_WANT_CORRUPTED_GOTO(x,l) \
+ { \
+ int fs_is_ok = (x); \
+ ASSERT(fs_is_ok); \
+ if (unlikely(!fs_is_ok)) { \
+ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
+ XFS_ERRLEVEL_LOW, NULL); \
+ error = XFS_ERROR(EFSCORRUPTED); \
+ goto l; \
+ } \
+ }
+
+#define XFS_WANT_CORRUPTED_RETURN(x) \
+ { \
+ int fs_is_ok = (x); \
+ ASSERT(fs_is_ok); \
+ if (unlikely(!fs_is_ok)) { \
+ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
+ XFS_ERRLEVEL_LOW, NULL); \
+ return XFS_ERROR(EFSCORRUPTED); \
+ } \
+ }
+
+/*
+ * error injection tags - the labels can be anything you want
+ * but each tag should have its own unique number
+ */
+
+#define XFS_ERRTAG_NOERROR 0
+#define XFS_ERRTAG_IFLUSH_1 1
+#define XFS_ERRTAG_IFLUSH_2 2
+#define XFS_ERRTAG_IFLUSH_3 3
+#define XFS_ERRTAG_IFLUSH_4 4
+#define XFS_ERRTAG_IFLUSH_5 5
+#define XFS_ERRTAG_IFLUSH_6 6
+#define XFS_ERRTAG_DA_READ_BUF 7
+#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
+#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
+#define XFS_ERRTAG_ALLOC_READ_AGF 10
+#define XFS_ERRTAG_IALLOC_READ_AGI 11
+#define XFS_ERRTAG_ITOBP_INOTOBP 12
+#define XFS_ERRTAG_IUNLINK 13
+#define XFS_ERRTAG_IUNLINK_REMOVE 14
+#define XFS_ERRTAG_DIR_INO_VALIDATE 15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
+#define XFS_ERRTAG_IODONE_IOERR 17
+#define XFS_ERRTAG_STRATREAD_IOERR 18
+#define XFS_ERRTAG_STRATCMPL_IOERR 19
+#define XFS_ERRTAG_DIOWRITE_IOERR 20
+#define XFS_ERRTAG_BMAPIFORMAT 21
+#define XFS_ERRTAG_MAX 22
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT 100
+#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
+
+#ifdef DEBUG
+extern int xfs_error_test_active;
+extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
+
+#define XFS_NUM_INJECT_ERROR 10
+#define XFS_TEST_ERROR(expr, mp, tag, rf) \
+ ((expr) || (xfs_error_test_active && \
+ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
+ (rf))))
+
+extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
+#else
+#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
+#define xfs_errortag_add(tag, mp) (ENOSYS)
+#define xfs_errortag_clearall(mp, loud) (ENOSYS)
+#endif /* DEBUG */
+
+/*
+ * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
+ * a panic by setting xfs_panic_mask in a sysctl.
+ */
+#define XFS_NO_PTAG 0
+#define XFS_PTAG_IFLUSH 0x00000001
+#define XFS_PTAG_LOGRES 0x00000002
+#define XFS_PTAG_AILDELETE 0x00000004
+#define XFS_PTAG_ERROR_REPORT 0x00000008
+#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
+#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
+#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
+#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
+
+#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
new file mode 100644
index 00000000..d22e6262
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.c
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_extfree_item.h"
+
+
+kmem_zone_t *xfs_efi_zone;
+kmem_zone_t *xfs_efd_zone;
+
+static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_efi_log_item, efi_item);
+}
+
+void
+xfs_efi_item_free(
+ struct xfs_efi_log_item *efip)
+{
+ if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
+ kmem_free(efip);
+ else
+ kmem_zone_free(xfs_efi_zone, efip);
+}
+
+/*
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the
+ * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
+ * the EFI.
+ */
+STATIC void
+__xfs_efi_release(
+ struct xfs_efi_log_item *efip)
+{
+ struct xfs_ail *ailp = efip->efi_item.li_ailp;
+
+ if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, &efip->efi_item);
+ xfs_efi_item_free(efip);
+ }
+}
+
+/*
+ * This returns the number of iovecs needed to log the given efi item.
+ * We only need 1 iovec for an efi item. It just logs the efi_log_format
+ * structure.
+ */
+STATIC uint
+xfs_efi_item_size(
+ struct xfs_log_item *lip)
+{
+ return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efi log item. We use only 1 iovec, and we point that
+ * at the efi_log_format structure embedded in the efi item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efi item have been filled.
+ */
+STATIC void
+xfs_efi_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
+{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ uint size;
+
+ ASSERT(atomic_read(&efip->efi_next_extent) ==
+ efip->efi_format.efi_nextents);
+
+ efip->efi_format.efi_type = XFS_LI_EFI;
+
+ size = sizeof(xfs_efi_log_format_t);
+ size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+ efip->efi_format.efi_size = 1;
+
+ log_vector->i_addr = &efip->efi_format;
+ log_vector->i_len = size;
+ log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
+ ASSERT(size >= sizeof(xfs_efi_log_format_t));
+}
+
+
+/*
+ * Pinning has no meaning for an efi item, so just return.
+ */
+STATIC void
+xfs_efi_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * While EFIs cannot really be pinned, the unpin operation is the last place at
+ * which the EFI is manipulated during a transaction. If we are being asked to
+ * remove the EFI it's because the transaction has been cancelled and by
+ * definition that means the EFI cannot be in the AIL so remove it from the
+ * transaction and free it. Otherwise coordinate with xfs_efi_release() (via
+ * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
+ */
+STATIC void
+xfs_efi_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
+ if (remove) {
+ ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
+ if (lip->li_desc)
+ xfs_trans_del_item(lip);
+ xfs_efi_item_free(efip);
+ return;
+ }
+ __xfs_efi_release(efip);
+}
+
+/*
+ * Efi items have no locking or pushing. However, since EFIs are
+ * pulled from the AIL when their corresponding EFDs are committed
+ * to disk, their situation is very similar to being pinned. Return
+ * XFS_ITEM_PINNED so that the caller will eventually flush the log.
+ * This should help in getting the EFI out of the AIL.
+ */
+STATIC uint
+xfs_efi_item_trylock(
+ struct xfs_log_item *lip)
+{
+ return XFS_ITEM_PINNED;
+}
+
+/*
+ * Efi items have no locking, so just return.
+ */
+STATIC void
+xfs_efi_item_unlock(
+ struct xfs_log_item *lip)
+{
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_efi_item_free(EFI_ITEM(lip));
+}
+
+/*
+ * The EFI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged. For bulk transaction committed
+ * processing, the EFI may be processed but not yet unpinned prior to the EFD
+ * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
+ * when processing the EFD.
+ */
+STATIC xfs_lsn_t
+xfs_efi_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+
+ set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
+ return lsn;
+}
+
+/*
+ * There isn't much you can do to push on an efi item. It is simply
+ * stuck waiting for all of its corresponding efd items to be
+ * committed to disk.
+ */
+STATIC void
+xfs_efi_item_push(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * The EFI dependency tracking op doesn't do squat. It can't because
+ * it doesn't know where the free extent is coming from. The dependency
+ * tracking has to be handled by the "enclosing" metadata object. For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_efi_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all efi log items.
+ */
+static struct xfs_item_ops xfs_efi_item_ops = {
+ .iop_size = xfs_efi_item_size,
+ .iop_format = xfs_efi_item_format,
+ .iop_pin = xfs_efi_item_pin,
+ .iop_unpin = xfs_efi_item_unpin,
+ .iop_trylock = xfs_efi_item_trylock,
+ .iop_unlock = xfs_efi_item_unlock,
+ .iop_committed = xfs_efi_item_committed,
+ .iop_push = xfs_efi_item_push,
+ .iop_committing = xfs_efi_item_committing
+};
+
+
+/*
+ * Allocate and initialize an efi item with the given number of extents.
+ */
+struct xfs_efi_log_item *
+xfs_efi_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_efi_log_item *efip;
+ uint size;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+ size = (uint)(sizeof(xfs_efi_log_item_t) +
+ ((nextents - 1) * sizeof(xfs_extent_t)));
+ efip = kmem_zalloc(size, KM_SLEEP);
+ } else {
+ efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
+ }
+
+ xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
+ efip->efi_format.efi_nextents = nextents;
+ efip->efi_format.efi_id = (__psint_t)(void*)efip;
+ atomic_set(&efip->efi_next_extent, 0);
+
+ return efip;
+}
+
+/*
+ * Copy an EFI format buffer from the given buf, and into the destination
+ * EFI format structure.
+ * The given buffer can be in 32 bit or 64 bit form (which has different padding),
+ * one of which will be the native format for this kernel.
+ * It will handle the conversion of formats if necessary.
+ */
+int
+xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+{
+ xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
+ uint i;
+ uint len = sizeof(xfs_efi_log_format_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);
+ uint len32 = sizeof(xfs_efi_log_format_32_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);
+ uint len64 = sizeof(xfs_efi_log_format_64_t) +
+ (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);
+
+ if (buf->i_len == len) {
+ memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+ return 0;
+ } else if (buf->i_len == len32) {
+ xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
+
+ dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
+ dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
+ dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents;
+ dst_efi_fmt->efi_id = src_efi_fmt_32->efi_id;
+ for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+ dst_efi_fmt->efi_extents[i].ext_start =
+ src_efi_fmt_32->efi_extents[i].ext_start;
+ dst_efi_fmt->efi_extents[i].ext_len =
+ src_efi_fmt_32->efi_extents[i].ext_len;
+ }
+ return 0;
+ } else if (buf->i_len == len64) {
+ xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
+
+ dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
+ dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
+ dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents;
+ dst_efi_fmt->efi_id = src_efi_fmt_64->efi_id;
+ for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+ dst_efi_fmt->efi_extents[i].ext_start =
+ src_efi_fmt_64->efi_extents[i].ext_start;
+ dst_efi_fmt->efi_extents[i].ext_len =
+ src_efi_fmt_64->efi_extents[i].ext_len;
+ }
+ return 0;
+ }
+ return EFSCORRUPTED;
+}
+
+/*
+ * This is called by the efd item code below to release references to the given
+ * efi item. Each efd calls this with the number of extents that it has
+ * logged, and when the sum of these reaches the total number of extents logged
+ * by this efi item we can free the efi item.
+ */
+void
+xfs_efi_release(xfs_efi_log_item_t *efip,
+ uint nextents)
+{
+ ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
+ if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+ __xfs_efi_release(efip);
+}
+
+static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_efd_log_item, efd_item);
+}
+
+STATIC void
+xfs_efd_item_free(struct xfs_efd_log_item *efdp)
+{
+ if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
+ kmem_free(efdp);
+ else
+ kmem_zone_free(xfs_efd_zone, efdp);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given efd item.
+ * We only need 1 iovec for an efd item. It just logs the efd_log_format
+ * structure.
+ */
+STATIC uint
+xfs_efd_item_size(
+ struct xfs_log_item *lip)
+{
+ return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efd log item. We use only 1 iovec, and we point that
+ * at the efd_log_format structure embedded in the efd item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efd item have been filled.
+ */
+STATIC void
+xfs_efd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *log_vector)
+{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+ uint size;
+
+ ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
+
+ efdp->efd_format.efd_type = XFS_LI_EFD;
+
+ size = sizeof(xfs_efd_log_format_t);
+ size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+ efdp->efd_format.efd_size = 1;
+
+ log_vector->i_addr = &efdp->efd_format;
+ log_vector->i_len = size;
+ log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
+ ASSERT(size >= sizeof(xfs_efd_log_format_t));
+}
+
+/*
+ * Pinning has no meaning for an efd item, so just return.
+ */
+STATIC void
+xfs_efd_item_pin(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an efd item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_efd_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+}
+
+/*
+ * Efd items have no locking, so just return success.
+ */
+STATIC uint
+xfs_efd_item_trylock(
+ struct xfs_log_item *lip)
+{
+ return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Efd items have no locking or pushing, so return failure
+ * so that the caller doesn't bother with us.
+ */
+STATIC void
+xfs_efd_item_unlock(
+ struct xfs_log_item *lip)
+{
+ if (lip->li_flags & XFS_LI_ABORTED)
+ xfs_efd_item_free(EFD_ITEM(lip));
+}
+
+/*
+ * When the efd item is committed to disk, all we need to do
+ * is delete our reference to our partner efi item and then
+ * free ourselves. Since we're freeing ourselves we must
+ * return -1 to keep the transaction code from further referencing
+ * this item.
+ */
+STATIC xfs_lsn_t
+xfs_efd_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ /*
+ * If we got a log I/O error, it's always the case that the LR with the
+ * EFI got unpinned and freed before the EFD got aborted.
+ */
+ if (!(lip->li_flags & XFS_LI_ABORTED))
+ xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
+
+ xfs_efd_item_free(efdp);
+ return (xfs_lsn_t)-1;
+}
+
+/*
+ * There isn't much you can do to push on an efd item. It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+STATIC void
+xfs_efd_item_push(
+ struct xfs_log_item *lip)
+{
+}
+
+/*
+ * The EFD dependency tracking op doesn't do squat. It can't because
+ * it doesn't know where the free extent is coming from. The dependency
+ * tracking has to be handled by the "enclosing" metadata object. For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_efd_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all efd log items.
+ */
+static struct xfs_item_ops xfs_efd_item_ops = {
+ .iop_size = xfs_efd_item_size,
+ .iop_format = xfs_efd_item_format,
+ .iop_pin = xfs_efd_item_pin,
+ .iop_unpin = xfs_efd_item_unpin,
+ .iop_trylock = xfs_efd_item_trylock,
+ .iop_unlock = xfs_efd_item_unlock,
+ .iop_committed = xfs_efd_item_committed,
+ .iop_push = xfs_efd_item_push,
+ .iop_committing = xfs_efd_item_committing
+};
+
+/*
+ * Allocate and initialize an efd item with the given number of extents.
+ */
+struct xfs_efd_log_item *
+xfs_efd_init(
+ struct xfs_mount *mp,
+ struct xfs_efi_log_item *efip,
+ uint nextents)
+
+{
+ struct xfs_efd_log_item *efdp;
+ uint size;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
+ size = (uint)(sizeof(xfs_efd_log_item_t) +
+ ((nextents - 1) * sizeof(xfs_extent_t)));
+ efdp = kmem_zalloc(size, KM_SLEEP);
+ } else {
+ efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
+ }
+
+ xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
+ efdp->efd_efip = efip;
+ efdp->efd_format.efd_nextents = nextents;
+ efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+ return efdp;
+}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
new file mode 100644
index 00000000..375f68e4
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_EXTFREE_ITEM_H__
+#define __XFS_EXTFREE_ITEM_H__
+
+struct xfs_mount;
+struct kmem_zone;
+
+typedef struct xfs_extent {
+ xfs_dfsbno_t ext_start;
+ xfs_extlen_t ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+
+typedef struct xfs_extent_32 {
+ __uint64_t ext_start;
+ __uint32_t ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+ __uint64_t ext_start;
+ __uint32_t ext_len;
+ __uint32_t ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log. The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+ __uint16_t efi_type; /* efi log item type */
+ __uint16_t efi_size; /* size of this item */
+ __uint32_t efi_nextents; /* # extents to free */
+ __uint64_t efi_id; /* efi identifier */
+ xfs_extent_t efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+typedef struct xfs_efi_log_format_32 {
+ __uint16_t efi_type; /* efi log item type */
+ __uint16_t efi_size; /* size of this item */
+ __uint32_t efi_nextents; /* # extents to free */
+ __uint64_t efi_id; /* efi identifier */
+ xfs_extent_32_t efi_extents[1]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+ __uint16_t efi_type; /* efi log item type */
+ __uint16_t efi_size; /* size of this item */
+ __uint32_t efi_nextents; /* # extents to free */
+ __uint64_t efi_id; /* efi identifier */
+ xfs_extent_64_t efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log. The efd_extents array is a variable size array whose
+ * size is given by efd_nextents;
+ */
+typedef struct xfs_efd_log_format {
+ __uint16_t efd_type; /* efd log item type */
+ __uint16_t efd_size; /* size of this item */
+ __uint32_t efd_nextents; /* # of extents freed */
+ __uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_t efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+typedef struct xfs_efd_log_format_32 {
+ __uint16_t efd_type; /* efd log item type */
+ __uint16_t efd_size; /* size of this item */
+ __uint32_t efd_nextents; /* # of extents freed */
+ __uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_32_t efd_extents[1]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+ __uint16_t efd_type; /* efd log item type */
+ __uint16_t efd_size; /* size of this item */
+ __uint32_t efd_nextents; /* # of extents freed */
+ __uint64_t efd_efi_id; /* id of corresponding efi */
+ xfs_extent_64_t efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+
+#ifdef __KERNEL__
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFI_MAX_FAST_EXTENTS 16
+
+/*
+ * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define XFS_EFI_RECOVERED 1
+#define XFS_EFI_COMMITTED 2
+
+/*
+ * This is the "extent free intention" log item. It is used
+ * to log the fact that some extents need to be free. It is
+ * used in conjunction with the "extent free done" log item
+ * described below.
+ */
+typedef struct xfs_efi_log_item {
+ xfs_log_item_t efi_item;
+ atomic_t efi_next_extent;
+ unsigned long efi_flags; /* misc flags */
+ xfs_efi_log_format_t efi_format;
+} xfs_efi_log_item_t;
+
+/*
+ * This is the "extent free done" log item. It is used to log
+ * the fact that some extents earlier mentioned in an efi item
+ * have been freed.
+ */
+typedef struct xfs_efd_log_item {
+ xfs_log_item_t efd_item;
+ xfs_efi_log_item_t *efd_efip;
+ uint efd_next_extent;
+ xfs_efd_log_format_t efd_format;
+} xfs_efd_log_item_t;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFD_MAX_FAST_EXTENTS 16
+
+extern struct kmem_zone *xfs_efi_zone;
+extern struct kmem_zone *xfs_efd_zone;
+
+xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
+xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
+ uint);
+int xfs_efi_copy_format(xfs_log_iovec_t *buf,
+ xfs_efi_log_format_t *dst_efi_fmt);
+void xfs_efi_item_free(xfs_efi_log_item_t *);
+
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
new file mode 100644
index 00000000..9124425b
--- /dev/null
+++ b/fs/xfs/xfs_filestream.c
@@ -0,0 +1,824 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inum.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_ag.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap.h"
+#include "xfs_alloc.h"
+#include "xfs_utils.h"
+#include "xfs_mru_cache.h"
+#include "xfs_filestream.h"
+#include "xfs_trace.h"
+
+#ifdef XFS_FILESTREAMS_TRACE
+
+ktrace_t *xfs_filestreams_trace_buf;
+
+STATIC void
+xfs_filestreams_trace(
+ xfs_mount_t *mp, /* mount point */
+ int type, /* type of trace */
+ const char *func, /* source function */
+ int line, /* source line number */
+ __psunsigned_t arg0,
+ __psunsigned_t arg1,
+ __psunsigned_t arg2,
+ __psunsigned_t arg3,
+ __psunsigned_t arg4,
+ __psunsigned_t arg5)
+{
+ ktrace_enter(xfs_filestreams_trace_buf,
+ (void *)(__psint_t)(type | (line << 16)),
+ (void *)func,
+ (void *)(__psunsigned_t)current_pid(),
+ (void *)mp,
+ (void *)(__psunsigned_t)arg0,
+ (void *)(__psunsigned_t)arg1,
+ (void *)(__psunsigned_t)arg2,
+ (void *)(__psunsigned_t)arg3,
+ (void *)(__psunsigned_t)arg4,
+ (void *)(__psunsigned_t)arg5,
+ NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
+#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
+#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
+#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
+#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
+#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
+#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
+ xfs_filestreams_trace(mp, t, __func__, __LINE__, \
+ (__psunsigned_t)a0, (__psunsigned_t)a1, \
+ (__psunsigned_t)a2, (__psunsigned_t)a3, \
+ (__psunsigned_t)a4, (__psunsigned_t)a5)
+
+#define TRACE_AG_SCAN(mp, ag, ag2) \
+ TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
+#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
+ TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
+#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
+ TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
+ cnt, free, scan, flag)
+#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
+ TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
+#define TRACE_FREE(mp, ip, pip, ag, cnt) \
+ TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
+#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
+ TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
+#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
+ TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
+#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
+ TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
+#define TRACE_ORPHAN(mp, ip, ag) \
+ TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
+
+
+#else
+#define TRACE_AG_SCAN(mp, ag, ag2)
+#define TRACE_AG_PICK1(mp, max_ag, maxfree)
+#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
+#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
+#define TRACE_FREE(mp, ip, pip, ag, cnt)
+#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
+#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
+#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
+#define TRACE_ORPHAN(mp, ip, ag)
+#endif
+
+static kmem_zone_t *item_zone;
+
+/*
+ * Structure for associating a file or a directory with an allocation group.
+ * The parent directory pointer is only needed for files, but since there will
+ * generally be vastly more files than directories in the cache, using the same
+ * data structure simplifies the code with very little memory overhead.
+ */
+typedef struct fstrm_item
+{
+ xfs_agnumber_t ag; /* AG currently in use for the file/directory. */
+ xfs_inode_t *ip; /* inode self-pointer. */
+ xfs_inode_t *pip; /* Parent directory inode pointer. */
+} fstrm_item_t;
+
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it. The mount
+ * point's m_peraglock is used to protect these counters from per-ag array
+ * re-allocation during a growfs operation. When xfs_growfs_data_private() is
+ * about to reallocate the array, it calls xfs_filestream_flush() with the
+ * m_peraglock held in write mode.
+ *
+ * Since xfs_mru_cache_flush() guarantees that all the free functions for all
+ * the cache elements have finished executing before it returns, it's safe for
+ * the free functions to use the atomic counters without m_peraglock protection.
+ * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
+ * whether it was called with the m_peraglock held in read mode, write mode or
+ * not held at all. The race condition this addresses is the following:
+ *
+ * - The work queue scheduler fires and pulls a filestream directory cache
+ * element off the LRU end of the cache for deletion, then gets pre-empted.
+ * - A growfs operation grabs the m_peraglock in write mode, flushes all the
+ * remaining items from the cache and reallocates the mount point's per-ag
+ * array, resetting all the counters to zero.
+ * - The work queue thread resumes and calls the free function for the element
+ * it started cleaning up earlier. In the process it decrements the
+ * filestreams counter for an AG that now has no references.
+ *
+ * With a shrinkfs feature, the above scenario could panic the system.
+ *
+ * All other uses of the following macros should be protected by either the
+ * m_peraglock held in read mode, or the cache's internal locking exposed by the
+ * interval between a call to xfs_mru_cache_lookup() and a call to
+ * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
+ * when new elements are added to the cache.
+ *
+ * Combined, these locking rules ensure that no associations will ever exist in
+ * the cache that reference per-ag array elements that have since been
+ * reallocated.
+ */
+static int
+xfs_filestream_peek_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_read(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static int
+xfs_filestream_get_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ret;
+
+ pag = xfs_perag_get(mp, agno);
+ ret = atomic_inc_return(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ return ret;
+}
+
+static void
+xfs_filestream_put_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, agno);
+ atomic_dec(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+}
+
+/*
+ * Scan the AGs starting at startag looking for an AG that isn't in use and has
+ * at least minlen blocks free.
+ */
+static int
+_xfs_filestream_pick_ag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t startag,
+ xfs_agnumber_t *agp,
+ int flags,
+ xfs_extlen_t minlen)
+{
+ int streams, max_streams;
+ int err, trylock, nscan;
+ xfs_extlen_t longest, free, minfree, maxfree = 0;
+ xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
+ struct xfs_perag *pag;
+
+ /* 2% of an AG's blocks must be free for it to be chosen. */
+ minfree = mp->m_sb.sb_agblocks / 50;
+
+ ag = startag;
+ *agp = NULLAGNUMBER;
+
+ /* For the first pass, don't sleep trying to init the per-AG. */
+ trylock = XFS_ALLOC_FLAG_TRYLOCK;
+
+ for (nscan = 0; 1; nscan++) {
+ pag = xfs_perag_get(mp, ag);
+ TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
+
+ if (!pag->pagf_init) {
+ err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+ if (err && !trylock) {
+ xfs_perag_put(pag);
+ return err;
+ }
+ }
+
+ /* Might fail sometimes during the 1st pass with trylock set. */
+ if (!pag->pagf_init)
+ goto next_ag;
+
+ /* Keep track of the AG with the most free blocks. */
+ if (pag->pagf_freeblks > maxfree) {
+ maxfree = pag->pagf_freeblks;
+ max_streams = atomic_read(&pag->pagf_fstrms);
+ max_ag = ag;
+ }
+
+ /*
+ * The AG reference count does two things: it enforces mutual
+ * exclusion when examining the suitability of an AG in this
+ * loop, and it guards against two filestreams being established
+ * in the same AG as each other.
+ */
+ if (xfs_filestream_get_ag(mp, ag) > 1) {
+ xfs_filestream_put_ag(mp, ag);
+ goto next_ag;
+ }
+
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (((minlen && longest >= minlen) ||
+ (!minlen && pag->pagf_freeblks >= minfree)) &&
+ (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
+ (flags & XFS_PICK_LOWSPACE))) {
+
+ /* Break out, retaining the reference on the AG. */
+ free = pag->pagf_freeblks;
+ streams = atomic_read(&pag->pagf_fstrms);
+ xfs_perag_put(pag);
+ *agp = ag;
+ break;
+ }
+
+ /* Drop the reference on this AG, it's not usable. */
+ xfs_filestream_put_ag(mp, ag);
+next_ag:
+ xfs_perag_put(pag);
+ /* Move to the next AG, wrapping to AG 0 if necessary. */
+ if (++ag >= mp->m_sb.sb_agcount)
+ ag = 0;
+
+ /* If a full pass of the AGs hasn't been done yet, continue. */
+ if (ag != startag)
+ continue;
+
+ /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+ if (trylock != 0) {
+ trylock = 0;
+ continue;
+ }
+
+ /* Finally, if lowspace wasn't set, set it for the 3rd pass. */
+ if (!(flags & XFS_PICK_LOWSPACE)) {
+ flags |= XFS_PICK_LOWSPACE;
+ continue;
+ }
+
+ /*
+ * Take the AG with the most free space, regardless of whether
+ * it's already in use by another filestream.
+ */
+ if (max_ag != NULLAGNUMBER) {
+ xfs_filestream_get_ag(mp, max_ag);
+ TRACE_AG_PICK1(mp, max_ag, maxfree);
+ streams = max_streams;
+ free = maxfree;
+ *agp = max_ag;
+ break;
+ }
+
+ /* take AG 0 if none matched */
+ TRACE_AG_PICK1(mp, max_ag, maxfree);
+ *agp = 0;
+ return 0;
+ }
+
+ TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
+
+ return 0;
+}
+
+/*
+ * Set the allocation group number for a file or a directory, updating inode
+ * references and per-AG references as appropriate.
+ */
+static int
+_xfs_filestream_update_ag(
+ xfs_inode_t *ip,
+ xfs_inode_t *pip,
+ xfs_agnumber_t ag)
+{
+ int err = 0;
+ xfs_mount_t *mp;
+ xfs_mru_cache_t *cache;
+ fstrm_item_t *item;
+ xfs_agnumber_t old_ag;
+ xfs_inode_t *old_pip;
+
+ /*
+ * Either ip is a regular file and pip is a directory, or ip is a
+ * directory and pip is NULL.
+ */
+ ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip &&
+ (pip->i_d.di_mode & S_IFDIR)) ||
+ ((ip->i_d.di_mode & S_IFDIR) && !pip)));
+
+ mp = ip->i_mount;
+ cache = mp->m_filestream;
+
+ item = xfs_mru_cache_lookup(cache, ip->i_ino);
+ if (item) {
+ ASSERT(item->ip == ip);
+ old_ag = item->ag;
+ item->ag = ag;
+ old_pip = item->pip;
+ item->pip = pip;
+ xfs_mru_cache_done(cache);
+
+ /*
+ * If the AG has changed, drop the old ref and take a new one,
+ * effectively transferring the reference from old to new AG.
+ */
+ if (ag != old_ag) {
+ xfs_filestream_put_ag(mp, old_ag);
+ xfs_filestream_get_ag(mp, ag);
+ }
+
+ /*
+ * If ip is a file and its pip has changed, drop the old ref and
+ * take a new one.
+ */
+ if (pip && pip != old_pip) {
+ IRELE(old_pip);
+ IHOLD(pip);
+ }
+
+ TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
+ ag, xfs_filestream_peek_ag(mp, ag));
+ return 0;
+ }
+
+ item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
+ if (!item)
+ return ENOMEM;
+
+ item->ag = ag;
+ item->ip = ip;
+ item->pip = pip;
+
+ err = xfs_mru_cache_insert(cache, ip->i_ino, item);
+ if (err) {
+ kmem_zone_free(item_zone, item);
+ return err;
+ }
+
+ /* Take a reference on the AG. */
+ xfs_filestream_get_ag(mp, ag);
+
+ /*
+ * Take a reference on the inode itself regardless of whether it's a
+ * regular file or a directory.
+ */
+ IHOLD(ip);
+
+ /*
+ * In the case of a regular file, take a reference on the parent inode
+ * as well to ensure it remains in-core.
+ */
+ if (pip)
+ IHOLD(pip);
+
+ TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
+ ag, xfs_filestream_peek_ag(mp, ag));
+
+ return 0;
+}
+
+/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
+STATIC void
+xfs_fstrm_free_func(
+ unsigned long ino,
+ void *data)
+{
+ fstrm_item_t *item = (fstrm_item_t *)data;
+ xfs_inode_t *ip = item->ip;
+
+ ASSERT(ip->i_ino == ino);
+
+ xfs_iflags_clear(ip, XFS_IFILESTREAM);
+
+ /* Drop the reference taken on the AG when the item was added. */
+ xfs_filestream_put_ag(ip->i_mount, item->ag);
+
+ TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
+ xfs_filestream_peek_ag(ip->i_mount, item->ag));
+
+ /*
+ * _xfs_filestream_update_ag() always takes a reference on the inode
+ * itself, whether it's a file or a directory. Release it here.
+ * This can result in the inode being freed and so we must
+ * not hold any inode locks when freeing filesstreams objects
+ * otherwise we can deadlock here.
+ */
+ IRELE(ip);
+
+ /*
+ * In the case of a regular file, _xfs_filestream_update_ag() also
+ * takes a ref on the parent inode to keep it in-core. Release that
+ * too.
+ */
+ if (item->pip)
+ IRELE(item->pip);
+
+ /* Finally, free the memory allocated for the item. */
+ kmem_zone_free(item_zone, item);
+}
+
+/*
+ * xfs_filestream_init() is called at xfs initialisation time to set up the
+ * memory zone that will be used for filestream data structure allocation.
+ */
+int
+xfs_filestream_init(void)
+{
+ item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
+ if (!item_zone)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * xfs_filestream_uninit() is called at xfs termination time to destroy the
+ * memory zone that was used for filestream data structure allocation.
+ */
+void
+xfs_filestream_uninit(void)
+{
+ kmem_zone_destroy(item_zone);
+}
+
+/*
+ * xfs_filestream_mount() is called when a file system is mounted with the
+ * filestream option. It is responsible for allocating the data structures
+ * needed to track the new file system's file streams.
+ */
+int
+xfs_filestream_mount(
+ xfs_mount_t *mp)
+{
+ int err;
+ unsigned int lifetime, grp_count;
+
+ /*
+ * The filestream timer tunable is currently fixed within the range of
+ * one second to four minutes, with five seconds being the default. The
+ * group count is somewhat arbitrary, but it'd be nice to adhere to the
+ * timer tunable to within about 10 percent. This requires at least 10
+ * groups.
+ */
+ lifetime = xfs_fstrm_centisecs * 10;
+ grp_count = 10;
+
+ err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
+ xfs_fstrm_free_func);
+
+ return err;
+}
+
+/*
+ * xfs_filestream_unmount() is called when a file system that was mounted with
+ * the filestream option is unmounted. It drains the data structures created
+ * to track the file system's file streams and frees all the memory that was
+ * allocated.
+ */
+void
+xfs_filestream_unmount(
+ xfs_mount_t *mp)
+{
+ xfs_mru_cache_destroy(mp->m_filestream);
+}
+
+/*
+ * Return the AG of the filestream the file or directory belongs to, or
+ * NULLAGNUMBER otherwise.
+ */
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+ xfs_inode_t *ip)
+{
+ xfs_mru_cache_t *cache;
+ fstrm_item_t *item;
+ xfs_agnumber_t ag;
+ int ref;
+
+ if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) {
+ ASSERT(0);
+ return NULLAGNUMBER;
+ }
+
+ cache = ip->i_mount->m_filestream;
+ item = xfs_mru_cache_lookup(cache, ip->i_ino);
+ if (!item) {
+ TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
+ return NULLAGNUMBER;
+ }
+
+ ASSERT(ip == item->ip);
+ ag = item->ag;
+ ref = xfs_filestream_peek_ag(ip->i_mount, ag);
+ xfs_mru_cache_done(cache);
+
+ TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
+ return ag;
+}
+
+/*
+ * xfs_filestream_associate() should only be called to associate a regular file
+ * with its parent directory. Calling it with a child directory isn't
+ * appropriate because filestreams don't apply to entire directory hierarchies.
+ * Creating a file in a child directory of an existing filestream directory
+ * starts a new filestream with its own allocation group association.
+ *
+ * Returns < 0 on error, 0 if successful association occurred, > 0 if
+ * we failed to get an association because of locking issues.
+ */
+int
+xfs_filestream_associate(
+ xfs_inode_t *pip,
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp;
+ xfs_mru_cache_t *cache;
+ fstrm_item_t *item;
+ xfs_agnumber_t ag, rotorstep, startag;
+ int err = 0;
+
+ ASSERT(pip->i_d.di_mode & S_IFDIR);
+ ASSERT(ip->i_d.di_mode & S_IFREG);
+ if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG))
+ return -EINVAL;
+
+ mp = pip->i_mount;
+ cache = mp->m_filestream;
+
+ /*
+ * We have a problem, Houston.
+ *
+ * Taking the iolock here violates inode locking order - we already
+ * hold the ilock. Hence if we block getting this lock we may never
+ * wake. Unfortunately, that means if we can't get the lock, we're
+ * screwed in terms of getting a stream association - we can't spin
+ * waiting for the lock because someone else is waiting on the lock we
+ * hold and we cannot drop that as we are in a transaction here.
+ *
+ * Lucky for us, this inversion is not a problem because it's a
+ * directory inode that we are trying to lock here.
+ *
+ * So, if we can't get the iolock without sleeping then just give up
+ */
+ if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
+ return 1;
+
+ /* If the parent directory is already in the cache, use its AG. */
+ item = xfs_mru_cache_lookup(cache, pip->i_ino);
+ if (item) {
+ ASSERT(item->ip == pip);
+ ag = item->ag;
+ xfs_mru_cache_done(cache);
+
+ TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
+ err = _xfs_filestream_update_ag(ip, pip, ag);
+
+ goto exit;
+ }
+
+ /*
+ * Set the starting AG using the rotor for inode32, otherwise
+ * use the directory inode's AG.
+ */
+ if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ rotorstep = xfs_rotorstep;
+ startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
+ mp->m_agfrotor = (mp->m_agfrotor + 1) %
+ (mp->m_sb.sb_agcount * rotorstep);
+ } else
+ startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
+
+ /* Pick a new AG for the parent inode starting at startag. */
+ err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
+ if (err || ag == NULLAGNUMBER)
+ goto exit_did_pick;
+
+ /* Associate the parent inode with the AG. */
+ err = _xfs_filestream_update_ag(pip, NULL, ag);
+ if (err)
+ goto exit_did_pick;
+
+ /* Associate the file inode with the AG. */
+ err = _xfs_filestream_update_ag(ip, pip, ag);
+ if (err)
+ goto exit_did_pick;
+
+ TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
+
+exit_did_pick:
+ /*
+ * If _xfs_filestream_pick_ag() returned a valid AG, remove the
+ * reference it took on it, since the file and directory will have taken
+ * their own now if they were successfully cached.
+ */
+ if (ag != NULLAGNUMBER)
+ xfs_filestream_put_ag(mp, ag);
+
+exit:
+ xfs_iunlock(pip, XFS_IOLOCK_EXCL);
+ return -err;
+}
+
+/*
+ * Pick a new allocation group for the current file and its file stream. This
+ * function is called by xfs_bmap_filestreams() with the mount point's per-ag
+ * lock held.
+ */
+int
+xfs_filestream_new_ag(
+ xfs_bmalloca_t *ap,
+ xfs_agnumber_t *agp)
+{
+ int flags, err;
+ xfs_inode_t *ip, *pip = NULL;
+ xfs_mount_t *mp;
+ xfs_mru_cache_t *cache;
+ xfs_extlen_t minlen;
+ fstrm_item_t *dir, *file;
+ xfs_agnumber_t ag = NULLAGNUMBER;
+
+ ip = ap->ip;
+ mp = ip->i_mount;
+ cache = mp->m_filestream;
+ minlen = ap->alen;
+ *agp = NULLAGNUMBER;
+
+ /*
+ * Look for the file in the cache, removing it if it's found. Doing
+ * this allows it to be held across the dir lookup that follows.
+ */
+ file = xfs_mru_cache_remove(cache, ip->i_ino);
+ if (file) {
+ ASSERT(ip == file->ip);
+
+ /* Save the file's parent inode and old AG number for later. */
+ pip = file->pip;
+ ag = file->ag;
+
+ /* Look for the file's directory in the cache. */
+ dir = xfs_mru_cache_lookup(cache, pip->i_ino);
+ if (dir) {
+ ASSERT(pip == dir->ip);
+
+ /*
+ * If the directory has already moved on to a new AG,
+ * use that AG as the new AG for the file. Don't
+ * forget to twiddle the AG refcounts to match the
+ * movement.
+ */
+ if (dir->ag != file->ag) {
+ xfs_filestream_put_ag(mp, file->ag);
+ xfs_filestream_get_ag(mp, dir->ag);
+ *agp = file->ag = dir->ag;
+ }
+
+ xfs_mru_cache_done(cache);
+ }
+
+ /*
+ * Put the file back in the cache. If this fails, the free
+ * function needs to be called to tidy up in the same way as if
+ * the item had simply expired from the cache.
+ */
+ err = xfs_mru_cache_insert(cache, ip->i_ino, file);
+ if (err) {
+ xfs_fstrm_free_func(ip->i_ino, file);
+ return err;
+ }
+
+ /*
+ * If the file's AG was moved to the directory's new AG, there's
+ * nothing more to be done.
+ */
+ if (*agp != NULLAGNUMBER) {
+ TRACE_MOVEAG(mp, ip, pip,
+ ag, xfs_filestream_peek_ag(mp, ag),
+ *agp, xfs_filestream_peek_ag(mp, *agp));
+ return 0;
+ }
+ }
+
+ /*
+ * If the file's parent directory is known, take its iolock in exclusive
+ * mode to prevent two sibling files from racing each other to migrate
+ * themselves and their parent to different AGs.
+ *
+ * Note that we lock the parent directory iolock inside the child
+ * iolock here. That's fine as we never hold both parent and child
+ * iolock in any other place. This is different from the ilock,
+ * which requires locking of the child after the parent for namespace
+ * operations.
+ */
+ if (pip)
+ xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+
+ /*
+ * A new AG needs to be found for the file. If the file's parent
+ * directory is also known, it will be moved to the new AG as well to
+ * ensure that files created inside it in future use the new AG.
+ */
+ ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
+ flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
+ (ap->low ? XFS_PICK_LOWSPACE : 0);
+
+ err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
+ if (err || *agp == NULLAGNUMBER)
+ goto exit;
+
+ /*
+ * If the file wasn't found in the file cache, then its parent directory
+ * inode isn't known. For this to have happened, the file must either
+ * be pre-existing, or it was created long enough ago that its cache
+ * entry has expired. This isn't the sort of usage that the filestreams
+ * allocator is trying to optimise, so there's no point trying to track
+ * its new AG somehow in the filestream data structures.
+ */
+ if (!pip) {
+ TRACE_ORPHAN(mp, ip, *agp);
+ goto exit;
+ }
+
+ /* Associate the parent inode with the AG. */
+ err = _xfs_filestream_update_ag(pip, NULL, *agp);
+ if (err)
+ goto exit;
+
+ /* Associate the file inode with the AG. */
+ err = _xfs_filestream_update_ag(ip, pip, *agp);
+ if (err)
+ goto exit;
+
+ TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
+ *agp, xfs_filestream_peek_ag(mp, *agp));
+
+exit:
+ /*
+ * If _xfs_filestream_pick_ag() returned a valid AG, remove the
+ * reference it took on it, since the file and directory will have taken
+ * their own now if they were successfully cached.
+ */
+ if (*agp != NULLAGNUMBER)
+ xfs_filestream_put_ag(mp, *agp);
+ else
+ *agp = 0;
+
+ if (pip)
+ xfs_iunlock(pip, XFS_IOLOCK_EXCL);
+
+ return err;
+}
+
+/*
+ * Remove an association between an inode and a filestream object.
+ * Typically this is done on last close of an unlinked file.
+ */
+void
+xfs_filestream_deassociate(
+ xfs_inode_t *ip)
+{
+ xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
+
+ xfs_mru_cache_delete(cache, ip->i_ino);
+}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
new file mode 100644
index 00000000..09dd9af4
--- /dev/null
+++ b/fs/xfs/xfs_filestream.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_FILESTREAM_H__
+#define __XFS_FILESTREAM_H__
+
+#ifdef __KERNEL__
+
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_perag;
+struct xfs_bmalloca;
+
+#ifdef XFS_FILESTREAMS_TRACE
+#define XFS_FSTRM_KTRACE_INFO 1
+#define XFS_FSTRM_KTRACE_AGSCAN 2
+#define XFS_FSTRM_KTRACE_AGPICK1 3
+#define XFS_FSTRM_KTRACE_AGPICK2 4
+#define XFS_FSTRM_KTRACE_UPDATE 5
+#define XFS_FSTRM_KTRACE_FREE 6
+#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7
+#define XFS_FSTRM_KTRACE_ASSOCIATE 8
+#define XFS_FSTRM_KTRACE_MOVEAG 9
+#define XFS_FSTRM_KTRACE_ORPHAN 10
+
+#define XFS_FSTRM_KTRACE_SIZE 16384
+extern ktrace_t *xfs_filestreams_trace_buf;
+
+#endif
+
+/* allocation selection flags */
+typedef enum xfs_fstrm_alloc {
+ XFS_PICK_USERDATA = 1,
+ XFS_PICK_LOWSPACE = 2,
+} xfs_fstrm_alloc_t;
+
+/* prototypes for filestream.c */
+int xfs_filestream_init(void);
+void xfs_filestream_uninit(void);
+int xfs_filestream_mount(struct xfs_mount *mp);
+void xfs_filestream_unmount(struct xfs_mount *mp);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
+int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
+void xfs_filestream_deassociate(struct xfs_inode *ip);
+int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+
+
+/* filestreams for the inode? */
+static inline int
+xfs_inode_is_filestream(
+ struct xfs_inode *ip)
+{
+ return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
+ xfs_iflags_test(ip, XFS_IFILESTREAM) ||
+ (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
new file mode 100644
index 00000000..8f6fc1a9
--- /dev/null
+++ b/fs/xfs/xfs_fs.h
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 1995-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_FS_H__
+#define __XFS_FS_H__
+
+/*
+ * SGI's XFS filesystem's major stuff (constants, structures)
+ */
+
+/*
+ * Direct I/O attribute record used with XFS_IOC_DIOINFO
+ * d_miniosz is the min xfer size, xfer size multiple and file seek offset
+ * alignment.
+ */
+#ifndef HAVE_DIOATTR
+struct dioattr {
+ __u32 d_mem; /* data buffer memory alignment */
+ __u32 d_miniosz; /* min xfer size */
+ __u32 d_maxiosz; /* max xfer size */
+};
+#endif
+
+/*
+ * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
+ */
+#ifndef HAVE_FSXATTR
+struct fsxattr {
+ __u32 fsx_xflags; /* xflags field value (get/set) */
+ __u32 fsx_extsize; /* extsize field value (get/set)*/
+ __u32 fsx_nextents; /* nextents field value (get) */
+ __u32 fsx_projid; /* project identifier (get/set) */
+ unsigned char fsx_pad[12];
+};
+#endif
+
+/*
+ * Flags for the bs_xflags/fsx_xflags field
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_DIFLAG_s.
+ */
+#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
+#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
+#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
+#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */
+#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
+#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */
+#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
+#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
+#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
+#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
+#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
+#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
+#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
+#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
+#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
+
+/*
+ * Structure for XFS_IOC_GETBMAP.
+ * On input, fill in bmv_offset and bmv_length of the first structure
+ * to indicate the area of interest in the file, and bmv_entries with
+ * the number of array elements given back. The first structure is
+ * updated on return to give the offset and length for the next call.
+ */
+#ifndef HAVE_GETBMAP
+struct getbmap {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_count; /* # of entries in array incl. 1st */
+ __s32 bmv_entries; /* # of entries filled in (output) */
+};
+#endif
+
+/*
+ * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries
+ * are used exactly as in the getbmap structure. The getbmapx structure
+ * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
+ * is only used for the first structure. It contains input flags
+ * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled
+ * in by the XFS_IOC_GETBMAPX command for each returned structure after
+ * the first.
+ */
+#ifndef HAVE_GETBMAPX
+struct getbmapx {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+ __s64 bmv_block; /* starting block (64-bit daddr_t) */
+ __s64 bmv_length; /* length of segment, blocks */
+ __s32 bmv_count; /* # of entries in array incl. 1st */
+ __s32 bmv_entries; /* # of entries filled in (output). */
+ __s32 bmv_iflags; /* input flags (1st structure) */
+ __s32 bmv_oflags; /* output flags (after 1st structure)*/
+ __s32 bmv_unused1; /* future use */
+ __s32 bmv_unused2; /* future use */
+};
+#endif
+
+/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */
+#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
+#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
+#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
+#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */
+#define BMV_IF_VALID \
+ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \
+ BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
+
+/* bmv_oflags values - returned for each non-header segment */
+#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
+#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
+#define BMV_OF_LAST 0x4 /* segment is the last in the file */
+
+/*
+ * Structure for XFS_IOC_FSSETDM.
+ * For use by backup and restore programs to set the XFS on-disk inode
+ * fields di_dmevmask and di_dmstate. These must be set to exactly and
+ * only values previously obtained via xfs_bulkstat! (Specifically the
+ * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
+ */
+#ifndef HAVE_FSDMIDATA
+struct fsdmidata {
+ __u32 fsd_dmevmask; /* corresponds to di_dmevmask */
+ __u16 fsd_padding;
+ __u16 fsd_dmstate; /* corresponds to di_dmstate */
+};
+#endif
+
+/*
+ * File segment locking set data type for 64 bit access.
+ * Also used for all the RESV/FREE interfaces.
+ */
+typedef struct xfs_flock64 {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start;
+ __s64 l_len; /* len == 0 means until end of file */
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserve area */
+} xfs_flock64_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V1
+ */
+typedef struct xfs_fsop_geom_v1 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+} xfs_fsop_geom_v1_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY
+ */
+typedef struct xfs_fsop_geom {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+ __u32 logsunit; /* log stripe unit, bytes */
+} xfs_fsop_geom_t;
+
+/* Output for XFS_FS_COUNTS */
+typedef struct xfs_fsop_counts {
+ __u64 freedata; /* free data section blocks */
+ __u64 freertx; /* free rt extents */
+ __u64 freeino; /* free inodes */
+ __u64 allocino; /* total allocated inodes */
+} xfs_fsop_counts_t;
+
+/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
+typedef struct xfs_fsop_resblks {
+ __u64 resblks;
+ __u64 resblks_avail;
+} xfs_fsop_resblks_t;
+
+#define XFS_FSOP_GEOM_VERSION 0
+
+#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */
+#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */
+#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */
+#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */
+#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */
+#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
+#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
+#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
+
+
+/*
+ * Minimum and maximum sizes need for growth checks
+ */
+#define XFS_MIN_AG_BLOCKS 64
+#define XFS_MIN_LOG_BLOCKS 512ULL
+#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL)
+#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL)
+
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
+
+/*
+ * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
+ */
+typedef struct xfs_growfs_data {
+ __u64 newblocks; /* new data subvol size, fsblocks */
+ __u32 imaxpct; /* new inode space percentage limit */
+} xfs_growfs_data_t;
+
+typedef struct xfs_growfs_log {
+ __u32 newblocks; /* new log size, fsblocks */
+ __u32 isint; /* 1 if new log is internal */
+} xfs_growfs_log_t;
+
+typedef struct xfs_growfs_rt {
+ __u64 newblocks; /* new realtime size, fsblocks */
+ __u32 extsize; /* new realtime extent size, fsblocks */
+} xfs_growfs_rt_t;
+
+
+/*
+ * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
+ */
+typedef struct xfs_bstime {
+ time_t tv_sec; /* seconds */
+ __s32 tv_nsec; /* and nanoseconds */
+} xfs_bstime_t;
+
+typedef struct xfs_bstat {
+ __u64 bs_ino; /* inode number */
+ __u16 bs_mode; /* type and mode */
+ __u16 bs_nlink; /* number of links */
+ __u32 bs_uid; /* user id */
+ __u32 bs_gid; /* group id */
+ __u32 bs_rdev; /* device value */
+ __s32 bs_blksize; /* block size */
+ __s64 bs_size; /* file size */
+ xfs_bstime_t bs_atime; /* access time */
+ xfs_bstime_t bs_mtime; /* modify time */
+ xfs_bstime_t bs_ctime; /* inode change time */
+ int64_t bs_blocks; /* number of blocks */
+ __u32 bs_xflags; /* extended flags */
+ __s32 bs_extsize; /* extent size */
+ __s32 bs_extents; /* number of extents */
+ __u32 bs_gen; /* generation count */
+ __u16 bs_projid_lo; /* lower part of project id */
+#define bs_projid bs_projid_lo /* (previously just bs_projid) */
+ __u16 bs_forkoff; /* inode fork offset in bytes */
+ __u16 bs_projid_hi; /* higher part of project id */
+ unsigned char bs_pad[10]; /* pad space, unused */
+ __u32 bs_dmevmask; /* DMIG event mask */
+ __u16 bs_dmstate; /* DMIG state info */
+ __u16 bs_aextents; /* attribute number of extents */
+} xfs_bstat_t;
+
+/*
+ * The user-level BulkStat Request interface structure.
+ */
+typedef struct xfs_fsop_bulkreq {
+ __u64 __user *lastip; /* last inode # pointer */
+ __s32 icount; /* count of entries in buffer */
+ void __user *ubuffer;/* user buffer for inode desc. */
+ __s32 __user *ocount; /* output count pointer */
+} xfs_fsop_bulkreq_t;
+
+
+/*
+ * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
+ */
+typedef struct xfs_inogrp {
+ __u64 xi_startino; /* starting inode number */
+ __s32 xi_alloccount; /* # bits set in allocmask */
+ __u64 xi_allocmask; /* mask of allocated inodes */
+} xfs_inogrp_t;
+
+
+/*
+ * Error injection.
+ */
+typedef struct xfs_error_injection {
+ __s32 fd;
+ __s32 errtag;
+} xfs_error_injection_t;
+
+
+/*
+ * The user-level Handle Request interface structure.
+ */
+typedef struct xfs_fsop_handlereq {
+ __u32 fd; /* fd for FD_TO_HANDLE */
+ void __user *path; /* user pathname */
+ __u32 oflags; /* open flags */
+ void __user *ihandle;/* user supplied handle */
+ __u32 ihandlen; /* user supplied length */
+ void __user *ohandle;/* user buffer for handle */
+ __u32 __user *ohandlen;/* user buffer length */
+} xfs_fsop_handlereq_t;
+
+/*
+ * Compound structures for passing args through Handle Request interfaces
+ * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
+ * XFS_IOC_ATTRMULTI_BY_HANDLE
+ */
+
+typedef struct xfs_fsop_setdm_handlereq {
+ struct xfs_fsop_handlereq hreq; /* handle information */
+ struct fsdmidata __user *data; /* DMAPI data */
+} xfs_fsop_setdm_handlereq_t;
+
+typedef struct xfs_attrlist_cursor {
+ __u32 opaque[4];
+} xfs_attrlist_cursor_t;
+
+typedef struct xfs_fsop_attrlist_handlereq {
+ struct xfs_fsop_handlereq hreq; /* handle interface structure */
+ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+ __u32 flags; /* which namespace to use */
+ __u32 buflen; /* length of buffer supplied */
+ void __user *buffer; /* returned names */
+} xfs_fsop_attrlist_handlereq_t;
+
+typedef struct xfs_attr_multiop {
+ __u32 am_opcode;
+#define ATTR_OP_GET 1 /* return the indicated attr's value */
+#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
+ __s32 am_error;
+ void __user *am_attrname;
+ void __user *am_attrvalue;
+ __u32 am_length;
+ __u32 am_flags;
+} xfs_attr_multiop_t;
+
+typedef struct xfs_fsop_attrmulti_handlereq {
+ struct xfs_fsop_handlereq hreq; /* handle interface structure */
+ __u32 opcount;/* count of following multiop */
+ struct xfs_attr_multiop __user *ops; /* attr_multi data */
+} xfs_fsop_attrmulti_handlereq_t;
+
+/*
+ * per machine unique filesystem identifier types.
+ */
+typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
+
+typedef struct xfs_fid {
+ __u16 fid_len; /* length of remainder */
+ __u16 fid_pad;
+ __u32 fid_gen; /* generation number */
+ __u64 fid_ino; /* 64 bits inode number */
+} xfs_fid_t;
+
+typedef struct xfs_handle {
+ union {
+ __s64 align; /* force alignment of ha_fid */
+ xfs_fsid_t _ha_fsid; /* unique file system identifier */
+ } ha_u;
+ xfs_fid_t ha_fid; /* file system specific file ID */
+} xfs_handle_t;
+#define ha_fsid ha_u._ha_fsid
+
+#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \
+ - (char *) &(handle)) \
+ + (handle).ha_fid.fid_len)
+
+/*
+ * Flags for going down operation
+ */
+#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
+#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
+#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+
+/*
+ * ioctl commands that are used by Linux filesystems
+ */
+#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
+#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
+#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
+
+/*
+ * ioctl commands that replace IRIX fcntl()'s
+ * For 'documentation' purposed more than anything else,
+ * the "cmd #" field reflects the IRIX fcntl number.
+ */
+#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
+#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr)
+#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
+#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
+#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata)
+#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64)
+#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64)
+#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap)
+#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr)
+/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
+/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
+#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
+#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
+
+/*
+ * ioctl commands that replace IRIX syssgi()'s
+ */
+#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1)
+#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq)
+#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq)
+#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext)
+#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data)
+#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log)
+#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt)
+#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts)
+#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks)
+#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks)
+#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
+#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
+/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
+/* XFS_IOC_FREEZE -- FIFREEZE 119 */
+/* XFS_IOC_THAW -- FITHAW 120 */
+#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
+#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
+#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
+#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom)
+#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t)
+/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
+
+
+#ifndef HAVE_BBMACROS
+/*
+ * Block I/O parameterization. A basic block (BB) is the lowest size of
+ * filesystem allocation, and must equal 512. Length units given to bio
+ * routines are in BB's.
+ */
+#define BBSHIFT 9
+#define BBSIZE (1<<BBSHIFT)
+#define BBMASK (BBSIZE-1)
+#define BTOBB(bytes) (((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT)
+#define BBTOB(bbs) ((bbs) << BBSHIFT)
+#endif
+
+#endif /* __XFS_FS_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
new file mode 100644
index 00000000..9153d2c7
--- /dev/null
+++ b/fs/xfs/xfs_fsops.c
@@ -0,0 +1,671 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_fsops.h"
+#include "xfs_itable.h"
+#include "xfs_trans_space.h"
+#include "xfs_rtalloc.h"
+#include "xfs_rw.h"
+#include "xfs_filestream.h"
+#include "xfs_trace.h"
+
+/*
+ * File system operations
+ */
+
+int
+xfs_fs_geometry(
+ xfs_mount_t *mp,
+ xfs_fsop_geom_t *geo,
+ int new_version)
+{
+
+ memset(geo, 0, sizeof(*geo));
+
+ geo->blocksize = mp->m_sb.sb_blocksize;
+ geo->rtextsize = mp->m_sb.sb_rextsize;
+ geo->agblocks = mp->m_sb.sb_agblocks;
+ geo->agcount = mp->m_sb.sb_agcount;
+ geo->logblocks = mp->m_sb.sb_logblocks;
+ geo->sectsize = mp->m_sb.sb_sectsize;
+ geo->inodesize = mp->m_sb.sb_inodesize;
+ geo->imaxpct = mp->m_sb.sb_imax_pct;
+ geo->datablocks = mp->m_sb.sb_dblocks;
+ geo->rtblocks = mp->m_sb.sb_rblocks;
+ geo->rtextents = mp->m_sb.sb_rextents;
+ geo->logstart = mp->m_sb.sb_logstart;
+ ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
+ memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
+ if (new_version >= 2) {
+ geo->sunit = mp->m_sb.sb_unit;
+ geo->swidth = mp->m_sb.sb_width;
+ }
+ if (new_version >= 3) {
+ geo->version = XFS_FSOP_GEOM_VERSION;
+ geo->flags =
+ (xfs_sb_version_hasattr(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
+ (xfs_sb_version_hasnlink(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
+ (xfs_sb_version_hasquota(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
+ (xfs_sb_version_hasalign(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
+ (xfs_sb_version_hasdalign(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
+ (xfs_sb_version_hasshared(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
+ (xfs_sb_version_hasextflgbit(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
+ (xfs_sb_version_hasdirv2(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
+ (xfs_sb_version_hassector(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
+ (xfs_sb_version_hasasciici(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) |
+ (xfs_sb_version_haslazysbcount(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
+ (xfs_sb_version_hasattr2(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+ geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
+ mp->m_sb.sb_logsectsize : BBSIZE;
+ geo->rtsectsize = mp->m_sb.sb_blocksize;
+ geo->dirblocksize = mp->m_dirblksize;
+ }
+ if (new_version >= 4) {
+ geo->flags |=
+ (xfs_sb_version_haslogv2(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
+ geo->logsunit = mp->m_sb.sb_logsunit;
+ }
+ return 0;
+}
+
+static int
+xfs_growfs_data_private(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_growfs_data_t *in) /* growfs data input struct */
+{
+ xfs_agf_t *agf;
+ xfs_agi_t *agi;
+ xfs_agnumber_t agno;
+ xfs_extlen_t agsize;
+ xfs_extlen_t tmpsize;
+ xfs_alloc_rec_t *arec;
+ struct xfs_btree_block *block;
+ xfs_buf_t *bp;
+ int bucket;
+ int dpct;
+ int error;
+ xfs_agnumber_t nagcount;
+ xfs_agnumber_t nagimax = 0;
+ xfs_rfsblock_t nb, nb_mod;
+ xfs_rfsblock_t new;
+ xfs_rfsblock_t nfree;
+ xfs_agnumber_t oagcount;
+ int pct;
+ xfs_trans_t *tp;
+
+ nb = in->newblocks;
+ pct = in->imaxpct;
+ if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+ return XFS_ERROR(EINVAL);
+ if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
+ return error;
+ dpct = pct - mp->m_sb.sb_imax_pct;
+ bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+ BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ if (!bp)
+ return EIO;
+ xfs_buf_relse(bp);
+
+ new = nb; /* use new as a temporary here */
+ nb_mod = do_div(new, mp->m_sb.sb_agblocks);
+ nagcount = new + (nb_mod != 0);
+ if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
+ nagcount--;
+ nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
+ if (nb < mp->m_sb.sb_dblocks)
+ return XFS_ERROR(EINVAL);
+ }
+ new = nb - mp->m_sb.sb_dblocks;
+ oagcount = mp->m_sb.sb_agcount;
+
+ /* allocate the new per-ag structures */
+ if (nagcount > oagcount) {
+ error = xfs_initialize_perag(mp, nagcount, &nagimax);
+ if (error)
+ return error;
+ }
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
+ XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ /*
+ * Write new AG headers to disk. Non-transactional, but written
+ * synchronously so they are completed prior to the growfs transaction
+ * being logged.
+ */
+ nfree = 0;
+ for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+ /*
+ * AG freelist header block
+ */
+ bp = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ agf = XFS_BUF_TO_AGF(bp);
+ memset(agf, 0, mp->m_sb.sb_sectsize);
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(agno);
+ if (agno == nagcount - 1)
+ agsize =
+ nb -
+ (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+ else
+ agsize = mp->m_sb.sb_agblocks;
+ agf->agf_length = cpu_to_be32(agsize);
+ agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+ agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ agf->agf_flfirst = 0;
+ agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+ agf->agf_flcount = 0;
+ tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
+ agf->agf_freeblks = cpu_to_be32(tmpsize);
+ agf->agf_longest = cpu_to_be32(tmpsize);
+ error = xfs_bwrite(mp, bp);
+ if (error) {
+ goto error0;
+ }
+ /*
+ * AG inode header block
+ */
+ bp = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), XBF_LOCK | XBF_MAPPED);
+ agi = XFS_BUF_TO_AGI(bp);
+ memset(agi, 0, mp->m_sb.sb_sectsize);
+ agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+ agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+ agi->agi_seqno = cpu_to_be32(agno);
+ agi->agi_length = cpu_to_be32(agsize);
+ agi->agi_count = 0;
+ agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ agi->agi_level = cpu_to_be32(1);
+ agi->agi_freecount = 0;
+ agi->agi_newino = cpu_to_be32(NULLAGINO);
+ agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ error = xfs_bwrite(mp, bp);
+ if (error) {
+ goto error0;
+ }
+ /*
+ * BNO btree root block
+ */
+ bp = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize),
+ XBF_LOCK | XBF_MAPPED);
+ block = XFS_BUF_TO_BLOCK(bp);
+ memset(block, 0, mp->m_sb.sb_blocksize);
+ block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
+ block->bb_level = 0;
+ block->bb_numrecs = cpu_to_be16(1);
+ block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+ arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+ arec->ar_blockcount = cpu_to_be32(
+ agsize - be32_to_cpu(arec->ar_startblock));
+ error = xfs_bwrite(mp, bp);
+ if (error) {
+ goto error0;
+ }
+ /*
+ * CNT btree root block
+ */
+ bp = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize),
+ XBF_LOCK | XBF_MAPPED);
+ block = XFS_BUF_TO_BLOCK(bp);
+ memset(block, 0, mp->m_sb.sb_blocksize);
+ block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
+ block->bb_level = 0;
+ block->bb_numrecs = cpu_to_be16(1);
+ block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+ arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+ arec->ar_blockcount = cpu_to_be32(
+ agsize - be32_to_cpu(arec->ar_startblock));
+ nfree += be32_to_cpu(arec->ar_blockcount);
+ error = xfs_bwrite(mp, bp);
+ if (error) {
+ goto error0;
+ }
+ /*
+ * INO btree root block
+ */
+ bp = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize),
+ XBF_LOCK | XBF_MAPPED);
+ block = XFS_BUF_TO_BLOCK(bp);
+ memset(block, 0, mp->m_sb.sb_blocksize);
+ block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
+ block->bb_level = 0;
+ block->bb_numrecs = 0;
+ block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ error = xfs_bwrite(mp, bp);
+ if (error) {
+ goto error0;
+ }
+ }
+ xfs_trans_agblocks_delta(tp, nfree);
+ /*
+ * There are new blocks in the old last a.g.
+ */
+ if (new) {
+ /*
+ * Change the agi length.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+ if (error) {
+ goto error0;
+ }
+ ASSERT(bp);
+ agi = XFS_BUF_TO_AGI(bp);
+ be32_add_cpu(&agi->agi_length, new);
+ ASSERT(nagcount == oagcount ||
+ be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
+ xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+ /*
+ * Change agf length.
+ */
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+ if (error) {
+ goto error0;
+ }
+ ASSERT(bp);
+ agf = XFS_BUF_TO_AGF(bp);
+ be32_add_cpu(&agf->agf_length, new);
+ ASSERT(be32_to_cpu(agf->agf_length) ==
+ be32_to_cpu(agi->agi_length));
+
+ xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+ /*
+ * Free the new space.
+ */
+ error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
+ be32_to_cpu(agf->agf_length) - new), new);
+ if (error) {
+ goto error0;
+ }
+ }
+
+ /*
+ * Update changed superblock fields transactionally. These are not
+ * seen by the rest of the world until the transaction commit applies
+ * them atomically to the superblock.
+ */
+ if (nagcount > oagcount)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+ if (nb > mp->m_sb.sb_dblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
+ nb - mp->m_sb.sb_dblocks);
+ if (nfree)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
+ if (dpct)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ return error;
+
+ /* New allocation groups fully initialized, so update mount struct */
+ if (nagimax)
+ mp->m_maxagi = nagimax;
+ if (mp->m_sb.sb_imax_pct) {
+ __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+ do_div(icount, 100);
+ mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+ } else
+ mp->m_maxicount = 0;
+ xfs_set_low_space_thresholds(mp);
+
+ /* update secondary superblocks. */
+ for (agno = 1; agno < nagcount; agno++) {
+ error = xfs_read_buf(mp, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error) {
+ xfs_warn(mp,
+ "error %d reading secondary superblock for ag %d",
+ error, agno);
+ break;
+ }
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+ /*
+ * If we get an error writing out the alternate superblocks,
+ * just issue a warning and continue. The real work is
+ * already done and committed.
+ */
+ if (!(error = xfs_bwrite(mp, bp))) {
+ continue;
+ } else {
+ xfs_warn(mp,
+ "write error %d updating secondary superblock for ag %d",
+ error, agno);
+ break; /* no point in continuing */
+ }
+ }
+ return 0;
+
+ error0:
+ xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ return error;
+}
+
+static int
+xfs_growfs_log_private(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_growfs_log_t *in) /* growfs log input struct */
+{
+ xfs_extlen_t nb;
+
+ nb = in->newblocks;
+ if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
+ return XFS_ERROR(EINVAL);
+ if (nb == mp->m_sb.sb_logblocks &&
+ in->isint == (mp->m_sb.sb_logstart != 0))
+ return XFS_ERROR(EINVAL);
+ /*
+ * Moving the log is hard, need new interfaces to sync
+ * the log first, hold off all activity while moving it.
+ * Can have shorter or longer log in the same space,
+ * or transform internal to external log or vice versa.
+ */
+ return XFS_ERROR(ENOSYS);
+}
+
+/*
+ * protected versions of growfs function acquire and release locks on the mount
+ * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
+ * XFS_IOC_FSGROWFSRT
+ */
+
+
+int
+xfs_growfs_data(
+ xfs_mount_t *mp,
+ xfs_growfs_data_t *in)
+{
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
+ if (!mutex_trylock(&mp->m_growlock))
+ return XFS_ERROR(EWOULDBLOCK);
+ error = xfs_growfs_data_private(mp, in);
+ mutex_unlock(&mp->m_growlock);
+ return error;
+}
+
+int
+xfs_growfs_log(
+ xfs_mount_t *mp,
+ xfs_growfs_log_t *in)
+{
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
+ if (!mutex_trylock(&mp->m_growlock))
+ return XFS_ERROR(EWOULDBLOCK);
+ error = xfs_growfs_log_private(mp, in);
+ mutex_unlock(&mp->m_growlock);
+ return error;
+}
+
+/*
+ * exported through ioctl XFS_IOC_FSCOUNTS
+ */
+
+int
+xfs_fs_counts(
+ xfs_mount_t *mp,
+ xfs_fsop_counts_t *cnt)
+{
+ xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ spin_lock(&mp->m_sb_lock);
+ cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ cnt->freertx = mp->m_sb.sb_frextents;
+ cnt->freeino = mp->m_sb.sb_ifree;
+ cnt->allocino = mp->m_sb.sb_icount;
+ spin_unlock(&mp->m_sb_lock);
+ return 0;
+}
+
+/*
+ * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
+ *
+ * xfs_reserve_blocks is called to set m_resblks
+ * in the in-core mount table. The number of unused reserved blocks
+ * is kept in m_resblks_avail.
+ *
+ * Reserve the requested number of blocks if available. Otherwise return
+ * as many as possible to satisfy the request. The actual number
+ * reserved are returned in outval
+ *
+ * A null inval pointer indicates that only the current reserved blocks
+ * available should be returned no settings are changed.
+ */
+
+int
+xfs_reserve_blocks(
+ xfs_mount_t *mp,
+ __uint64_t *inval,
+ xfs_fsop_resblks_t *outval)
+{
+ __int64_t lcounter, delta, fdblks_delta;
+ __uint64_t request;
+
+ /* If inval is null, report current values and return */
+ if (inval == (__uint64_t *)NULL) {
+ if (!outval)
+ return EINVAL;
+ outval->resblks = mp->m_resblks;
+ outval->resblks_avail = mp->m_resblks_avail;
+ return 0;
+ }
+
+ request = *inval;
+
+ /*
+ * With per-cpu counters, this becomes an interesting
+ * problem. we needto work out if we are freeing or allocation
+ * blocks first, then we can do the modification as necessary.
+ *
+ * We do this under the m_sb_lock so that if we are near
+ * ENOSPC, we will hold out any changes while we work out
+ * what to do. This means that the amount of free space can
+ * change while we do this, so we need to retry if we end up
+ * trying to reserve more space than is available.
+ *
+ * We also use the xfs_mod_incore_sb() interface so that we
+ * don't have to care about whether per cpu counter are
+ * enabled, disabled or even compiled in....
+ */
+retry:
+ spin_lock(&mp->m_sb_lock);
+ xfs_icsb_sync_counters_locked(mp, 0);
+
+ /*
+ * If our previous reservation was larger than the current value,
+ * then move any unused blocks back to the free pool.
+ */
+ fdblks_delta = 0;
+ if (mp->m_resblks > request) {
+ lcounter = mp->m_resblks_avail - request;
+ if (lcounter > 0) { /* release unused blocks */
+ fdblks_delta = lcounter;
+ mp->m_resblks_avail -= lcounter;
+ }
+ mp->m_resblks = request;
+ } else {
+ __int64_t free;
+
+ free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ if (!free)
+ goto out; /* ENOSPC and fdblks_delta = 0 */
+
+ delta = request - mp->m_resblks;
+ lcounter = free - delta;
+ if (lcounter < 0) {
+ /* We can't satisfy the request, just get what we can */
+ mp->m_resblks += free;
+ mp->m_resblks_avail += free;
+ fdblks_delta = -free;
+ } else {
+ fdblks_delta = -delta;
+ mp->m_resblks = request;
+ mp->m_resblks_avail += delta;
+ }
+ }
+out:
+ if (outval) {
+ outval->resblks = mp->m_resblks;
+ outval->resblks_avail = mp->m_resblks_avail;
+ }
+ spin_unlock(&mp->m_sb_lock);
+
+ if (fdblks_delta) {
+ /*
+ * If we are putting blocks back here, m_resblks_avail is
+ * already at its max so this will put it in the free pool.
+ *
+ * If we need space, we'll either succeed in getting it
+ * from the free block count or we'll get an enospc. If
+ * we get a ENOSPC, it means things changed while we were
+ * calculating fdblks_delta and so we should try again to
+ * see if there is anything left to reserve.
+ *
+ * Don't set the reserved flag here - we don't want to reserve
+ * the extra reserve blocks from the reserve.....
+ */
+ int error;
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ fdblks_delta, 0);
+ if (error == ENOSPC)
+ goto retry;
+ }
+ return 0;
+}
+
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead and use a
+ * synchronous transaction to ensure the superblock is immediately unpinned
+ * and can be written back.
+ */
+int
+xfs_fs_log_dummy(
+ xfs_mount_t *mp)
+{
+ xfs_trans_t *tp;
+ int error;
+
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
+ error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ /* log the UUID because it is an unchanging field */
+ xfs_mod_sb(tp, XFS_SB_UUID);
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp, 0);
+}
+
+int
+xfs_fs_goingdown(
+ xfs_mount_t *mp,
+ __uint32_t inflags)
+{
+ switch (inflags) {
+ case XFS_FSOP_GOING_FLAGS_DEFAULT: {
+ struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
+
+ if (sb && !IS_ERR(sb)) {
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+ thaw_bdev(sb->s_bdev, sb);
+ }
+
+ break;
+ }
+ case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+ break;
+ case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
+ xfs_force_shutdown(mp,
+ SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
+ break;
+ default:
+ return XFS_ERROR(EINVAL);
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
new file mode 100644
index 00000000..1b6a98b6
--- /dev/null
+++ b/fs/xfs/xfs_fsops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_FSOPS_H__
+#define __XFS_FSOPS_H__
+
+extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion);
+extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
+extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
+extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
+extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
+ xfs_fsop_resblks_t *outval);
+extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
+extern int xfs_fs_log_dummy(struct xfs_mount *mp);
+
+#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
new file mode 100644
index 00000000..84ebeec1
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.c
@@ -0,0 +1,1563 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+
+
+/*
+ * Allocation group level functions.
+ */
+static inline int
+xfs_ialloc_cluster_alignment(
+ xfs_alloc_arg_t *args)
+{
+ if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+ args->mp->m_sb.sb_inoalignmt >=
+ XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
+ return args->mp->m_sb.sb_inoalignmt;
+ return 1;
+}
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int /* error */
+xfs_inobt_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ xfs_lookup_t dir, /* <=, >=, == */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_inobt_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
+{
+ union xfs_btree_rec rec;
+
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec, /* btree record */
+ int *stat) /* output: success/failure */
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+ }
+ return error;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur,
+ struct xfs_agi *agi)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+ ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi) 0
+#endif
+
+/*
+ * Initialise a new set of inodes.
+ */
+STATIC void
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int blks_per_cluster, nbufs, ninodes;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+
+ /*
+ * Loop over the new block(s), filling in the inodes.
+ * For small block sizes, manipulate the inodes in buffers
+ * which are multiples of the blocks size.
+ */
+ if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+ blks_per_cluster = 1;
+ nbufs = length;
+ ninodes = mp->m_sb.sb_inopblock;
+ } else {
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+ mp->m_sb.sb_blocksize;
+ nbufs = length / blks_per_cluster;
+ ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+ }
+
+ /*
+ * Figure out what version number to use in the inodes we create.
+ * If the superblock version has caught up to the one that supports
+ * the new inode format, then use the new inode version. Otherwise
+ * use the old version so that old kernels will continue to be
+ * able to use the file system.
+ */
+ if (xfs_sb_version_hasnlink(&mp->m_sb))
+ version = 2;
+ else
+ version = 1;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * blks_per_cluster,
+ XBF_LOCK);
+ ASSERT(fbuf);
+ ASSERT(!XFS_BUF_GETERROR(fbuf));
+
+ /*
+ * Initialize all inodes in this buffer and then log them.
+ *
+ * XXX: It would be much better if we had just one transaction
+ * to log a whole cluster of inodes instead of all the
+ * individual transactions causing a lot of log traffic.
+ */
+ xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+ for (i = 0; i < ninodes; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+ uint isize = sizeof(struct xfs_dinode);
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+ }
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ }
+}
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp.
+ * Return 0 for success, else error code.
+ */
+STATIC int /* error code or 0 */
+xfs_ialloc_ag_alloc(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_buf_t *agbp, /* alloc group buffer */
+ int *alloc)
+{
+ xfs_agi_t *agi; /* allocation group header */
+ xfs_alloc_arg_t args; /* allocation argument structure */
+ xfs_btree_cur_t *cur; /* inode btree cursor */
+ xfs_agnumber_t agno;
+ int error;
+ int i;
+ xfs_agino_t newino; /* new first inode's number */
+ xfs_agino_t newlen; /* new number of inodes */
+ xfs_agino_t thisino; /* current inode number, for loop */
+ int isaligned = 0; /* inode allocation at stripe unit */
+ /* boundary */
+ struct xfs_perag *pag;
+
+ args.tp = tp;
+ args.mp = tp->t_mountp;
+
+ /*
+ * Locking will ensure that we don't have two callers in here
+ * at one time.
+ */
+ newlen = XFS_IALLOC_INODES(args.mp);
+ if (args.mp->m_maxicount &&
+ args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+ return XFS_ERROR(ENOSPC);
+ args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
+ /*
+ * First try to allocate inodes contiguous with the last-allocated
+ * chunk of inodes. If the filesystem is striped, this will fill
+ * an entire stripe unit with inodes.
+ */
+ agi = XFS_BUF_TO_AGI(agbp);
+ newino = be32_to_cpu(agi->agi_newino);
+ agno = be32_to_cpu(agi->agi_seqno);
+ args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+ XFS_IALLOC_BLOCKS(args.mp);
+ if (likely(newino != NULLAGINO &&
+ (args.agbno < be32_to_cpu(agi->agi_length)))) {
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.type = XFS_ALLOCTYPE_THIS_BNO;
+ args.mod = args.total = args.wasdel = args.isfl =
+ args.userdata = args.minalignslop = 0;
+ args.prod = 1;
+
+ /*
+ * We need to take into account alignment here to ensure that
+ * we don't modify the free list if we fail to have an exact
+ * block. If we don't have an exact match, and every oher
+ * attempt allocation attempt fails, we'll end up cancelling
+ * a dirty transaction and shutting down.
+ *
+ * For an exact allocation, alignment must be 1,
+ * however we need to take cluster alignment into account when
+ * fixing up the freelist. Use the minalignslop field to
+ * indicate that extra blocks might be required for alignment,
+ * but not to use them in the actual exact allocation.
+ */
+ args.alignment = 1;
+ args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+ /* Allow space for the inode btree to split. */
+ args.minleft = args.mp->m_in_maxlevels - 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ } else
+ args.fsbno = NULLFSBLOCK;
+
+ if (unlikely(args.fsbno == NULLFSBLOCK)) {
+ /*
+ * Set the alignment for the allocation.
+ * If stripe alignment is turned on then align at stripe unit
+ * boundary.
+ * If the cluster size is smaller than a filesystem block
+ * then we're doing I/O for inodes in filesystem block size
+ * pieces, so don't need alignment anyway.
+ */
+ isaligned = 0;
+ if (args.mp->m_sinoalign) {
+ ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+ args.alignment = args.mp->m_dalign;
+ isaligned = 1;
+ } else
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
+ /*
+ * Need to figure out where to allocate the inode blocks.
+ * Ideally they should be spaced out through the a.g.
+ * For now, just allocate blocks up front.
+ */
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ /*
+ * Allocate a fixed-size extent of inodes.
+ */
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.mod = args.total = args.wasdel = args.isfl =
+ args.userdata = args.minalignslop = 0;
+ args.prod = 1;
+ /*
+ * Allow space for the inode btree to split.
+ */
+ args.minleft = args.mp->m_in_maxlevels - 1;
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+
+ /*
+ * If stripe alignment is turned on, then try again with cluster
+ * alignment.
+ */
+ if (isaligned && args.fsbno == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
+ if ((error = xfs_alloc_vextent(&args)))
+ return error;
+ }
+
+ if (args.fsbno == NULLFSBLOCK) {
+ *alloc = 0;
+ return 0;
+ }
+ ASSERT(args.len == args.minlen);
+
+ /*
+ * Stamp and write the inode buffers.
+ *
+ * Seed the new inode cluster with a random generation number. This
+ * prevents short-term reuse of generation numbers if a chunk is
+ * freed and then immediately reallocated. We use random numbers
+ * rather than a linear progression to prevent the next generation
+ * number from being easily guessable.
+ */
+ xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
+ random32());
+
+ /*
+ * Convert the results.
+ */
+ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+ be32_add_cpu(&agi->agi_count, newlen);
+ be32_add_cpu(&agi->agi_freecount, newlen);
+ pag = xfs_perag_get(args.mp, agno);
+ pag->pagi_freecount += newlen;
+ xfs_perag_put(pag);
+ agi->agi_newino = cpu_to_be32(newino);
+
+ /*
+ * Insert records describing the new inode chunk into the btree.
+ */
+ cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ cur->bc_rec.i.ir_startino = thisino;
+ cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
+ cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+ error = xfs_btree_insert(cur, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ /*
+ * Log allocation group header fields
+ */
+ xfs_ialloc_log_agi(tp, agbp,
+ XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+ /*
+ * Modify/log superblock values for inode count and inode free count.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+ *alloc = 1;
+ return 0;
+}
+
+STATIC xfs_agnumber_t
+xfs_ialloc_next_ag(
+ xfs_mount_t *mp)
+{
+ xfs_agnumber_t agno;
+
+ spin_lock(&mp->m_agirotor_lock);
+ agno = mp->m_agirotor;
+ if (++mp->m_agirotor == mp->m_maxagi)
+ mp->m_agirotor = 0;
+ spin_unlock(&mp->m_agirotor_lock);
+
+ return agno;
+}
+
+/*
+ * Select an allocation group to look for a free inode in, based on the parent
+ * inode and then mode. Return the allocation group buffer.
+ */
+STATIC xfs_buf_t * /* allocation group buffer */
+xfs_ialloc_ag_select(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_ino_t parent, /* parent directory inode number */
+ mode_t mode, /* bits set to indicate file type */
+ int okalloc) /* ok to allocate more space */
+{
+ xfs_buf_t *agbp; /* allocation group header buffer */
+ xfs_agnumber_t agcount; /* number of ag's in the filesystem */
+ xfs_agnumber_t agno; /* current ag number */
+ int flags; /* alloc buffer locking flags */
+ xfs_extlen_t ineed; /* blocks needed for inode allocation */
+ xfs_extlen_t longest = 0; /* longest extent available */
+ xfs_mount_t *mp; /* mount point structure */
+ int needspace; /* file mode implies space allocated */
+ xfs_perag_t *pag; /* per allocation group data */
+ xfs_agnumber_t pagno; /* parent (starting) ag number */
+
+ /*
+ * Files of these types need at least one block if length > 0
+ * (and they won't fit in the inode, but that's hard to figure out).
+ */
+ needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+ mp = tp->t_mountp;
+ agcount = mp->m_maxagi;
+ if (S_ISDIR(mode))
+ pagno = xfs_ialloc_next_ag(mp);
+ else {
+ pagno = XFS_INO_TO_AGNO(mp, parent);
+ if (pagno >= agcount)
+ pagno = 0;
+ }
+ ASSERT(pagno < agcount);
+ /*
+ * Loop through allocation groups, looking for one with a little
+ * free space in it. Note we don't look for free inodes, exactly.
+ * Instead, we include whether there is a need to allocate inodes
+ * to mean that blocks must be allocated for them,
+ * if none are currently free.
+ */
+ agno = pagno;
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
+ for (;;) {
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_init) {
+ if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+ agbp = NULL;
+ goto nextag;
+ }
+ } else
+ agbp = NULL;
+
+ if (!pag->pagi_inodeok) {
+ xfs_ialloc_next_ag(mp);
+ goto unlock_nextag;
+ }
+
+ /*
+ * Is there enough free space for the file plus a block
+ * of inodes (if we need to allocate some)?
+ */
+ ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
+ if (ineed && !pag->pagf_init) {
+ if (agbp == NULL &&
+ xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+ agbp = NULL;
+ goto nextag;
+ }
+ (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
+ }
+ if (!ineed || pag->pagf_init) {
+ if (ineed && !(longest = pag->pagf_longest))
+ longest = pag->pagf_flcount > 0;
+ if (!ineed ||
+ (pag->pagf_freeblks >= needspace + ineed &&
+ longest >= ineed &&
+ okalloc)) {
+ if (agbp == NULL &&
+ xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+ agbp = NULL;
+ goto nextag;
+ }
+ xfs_perag_put(pag);
+ return agbp;
+ }
+ }
+unlock_nextag:
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+nextag:
+ xfs_perag_put(pag);
+ /*
+ * No point in iterating over the rest, if we're shutting
+ * down.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return NULL;
+ agno++;
+ if (agno >= agcount)
+ agno = 0;
+ if (agno == pagno) {
+ if (flags == 0)
+ return NULL;
+ flags = 0;
+ }
+ }
+}
+
+/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
+
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+/*
+ * Visible inode allocation functions.
+ */
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * The arguments IO_agbp and alloc_done are defined to work within
+ * the constraint of one allocation per transaction.
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes. On the first call,
+ * IO_agbp should be set to NULL. If an inode is available,
+ * i.e., xfs_dialloc() did not need to do an allocation, an inode
+ * number is returned. In this case, IO_agbp would be set to the
+ * current ag_buf and alloc_done set to false.
+ * If an allocation needed to be done, xfs_dialloc would return
+ * the current ag_buf in IO_agbp and set alloc_done to true.
+ * The caller should then commit the current transaction, allocate a new
+ * transaction, and call xfs_dialloc() again, passing in the previous
+ * value of IO_agbp. IO_agbp should be held across the transactions.
+ * Since the agbp is locked across the two calls, the second call is
+ * guaranteed to have a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated. The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_ino_t parent, /* parent inode (directory) */
+ mode_t mode, /* mode bits for new inode */
+ int okalloc, /* ok to allocate more space */
+ xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
+ boolean_t *alloc_done, /* true if we needed to replenish
+ inode freelist */
+ xfs_ino_t *inop) /* inode number allocated */
+{
+ xfs_agnumber_t agcount; /* number of allocation groups */
+ xfs_buf_t *agbp; /* allocation group header's buffer */
+ xfs_agnumber_t agno; /* allocation group number */
+ xfs_agi_t *agi; /* allocation group header structure */
+ xfs_btree_cur_t *cur; /* inode allocation btree cursor */
+ int error; /* error return value */
+ int i; /* result code */
+ int ialloced; /* inode allocation status */
+ int noroom = 0; /* no space for inode blk allocation */
+ xfs_ino_t ino; /* fs-relative inode to be returned */
+ /* REFERENCED */
+ int j; /* result code */
+ xfs_mount_t *mp; /* file system mount structure */
+ int offset; /* index of inode in chunk */
+ xfs_agino_t pagino; /* parent's AG relative inode # */
+ xfs_agnumber_t pagno; /* parent's AG number */
+ xfs_inobt_rec_incore_t rec; /* inode allocation record */
+ xfs_agnumber_t tagno; /* testing allocation group number */
+ xfs_btree_cur_t *tcur; /* temp cursor */
+ xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
+ struct xfs_perag *pag;
+
+
+ if (*IO_agbp == NULL) {
+ /*
+ * We do not have an agbp, so select an initial allocation
+ * group for inode allocation.
+ */
+ agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+ /*
+ * Couldn't find an allocation group satisfying the
+ * criteria, give up.
+ */
+ if (!agbp) {
+ *inop = NULLFSINO;
+ return 0;
+ }
+ agi = XFS_BUF_TO_AGI(agbp);
+ ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ } else {
+ /*
+ * Continue where we left off before. In this case, we
+ * know that the allocation group has free inodes.
+ */
+ agbp = *IO_agbp;
+ agi = XFS_BUF_TO_AGI(agbp);
+ ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
+ }
+ mp = tp->t_mountp;
+ agcount = mp->m_sb.sb_agcount;
+ agno = be32_to_cpu(agi->agi_seqno);
+ tagno = agno;
+ pagno = XFS_INO_TO_AGNO(mp, parent);
+ pagino = XFS_INO_TO_AGINO(mp, parent);
+
+ /*
+ * If we have already hit the ceiling of inode blocks then clear
+ * okalloc so we scan all available agi structures for a free
+ * inode.
+ */
+
+ if (mp->m_maxicount &&
+ mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
+ noroom = 1;
+ okalloc = 0;
+ }
+
+ /*
+ * Loop until we find an allocation group that either has free inodes
+ * or in which we can allocate some inodes. Iterate through the
+ * allocation groups upward, wrapping at the end.
+ */
+ *alloc_done = B_FALSE;
+ while (!agi->agi_freecount) {
+ /*
+ * Don't do anything if we're not supposed to allocate
+ * any blocks, just go on to the next ag.
+ */
+ if (okalloc) {
+ /*
+ * Try to allocate some new inodes in the allocation
+ * group.
+ */
+ if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
+ xfs_trans_brelse(tp, agbp);
+ if (error == ENOSPC) {
+ *inop = NULLFSINO;
+ return 0;
+ } else
+ return error;
+ }
+ if (ialloced) {
+ /*
+ * We successfully allocated some inodes, return
+ * the current context to the caller so that it
+ * can commit the current transaction and call
+ * us again where we left off.
+ */
+ ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
+ *alloc_done = B_TRUE;
+ *IO_agbp = agbp;
+ *inop = NULLFSINO;
+ return 0;
+ }
+ }
+ /*
+ * If it failed, give up on this ag.
+ */
+ xfs_trans_brelse(tp, agbp);
+ /*
+ * Go on to the next ag: get its ag header.
+ */
+nextag:
+ if (++tagno == agcount)
+ tagno = 0;
+ if (tagno == agno) {
+ *inop = NULLFSINO;
+ return noroom ? ENOSPC : 0;
+ }
+ pag = xfs_perag_get(mp, tagno);
+ if (pag->pagi_inodeok == 0) {
+ xfs_perag_put(pag);
+ goto nextag;
+ }
+ error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
+ xfs_perag_put(pag);
+ if (error)
+ goto nextag;
+ agi = XFS_BUF_TO_AGI(agbp);
+ ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ }
+ /*
+ * Here with an allocation group that has a free inode.
+ * Reset agno since we may have chosen a new ag in the
+ * loop above.
+ */
+ agno = tagno;
+ *IO_agbp = NULL;
+ pag = xfs_perag_get(mp, agno);
+
+ restart_pagno:
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
+ /*
+ * If in the same AG as the parent, try to get near the parent.
+ */
+ if (pagno == agno) {
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+ int searchdistance = 10;
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (rec.ir_freecount > 0) {
+ /*
+ * Found a free inode in the same chunk
+ * as the parent, done.
+ */
+ goto alloc_inode;
+ }
+
+
+ /*
+ * In the same AG as parent, but parent's chunk is full.
+ */
+
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * Skip to last blocks looked up if same parent inode.
+ */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft, 1);
+ if (error)
+ goto error1;
+
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
+ goto error1;
+
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+ while (!doneleft || !doneright) {
+ int useleft; /* using left inode chunk this time */
+
+ if (!--searchdistance) {
+ /*
+ * Not in range - save last search
+ * location and allocate a new inode
+ */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto newino;
+ }
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
+ }
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ rec = trec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
+ }
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
+ }
+
+ /*
+ * In a different AG from the parent.
+ * See if the most recently allocated block has any free.
+ */
+newino:
+ if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error0;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
+ goto error0;
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
+ }
+ }
+ }
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+
+alloc_inode:
+ offset = xfs_ialloc_find_free(&rec.ir_free);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error0;
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+ xfs_perag_put(pag);
+ *inop = ino;
+ return 0;
+error1:
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_ino_t inode, /* inode to be freed */
+ xfs_bmap_free_t *flist, /* extents to free */
+ int *delete, /* set if inode cluster was deleted */
+ xfs_ino_t *first_ino) /* first inode in deleted cluster */
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ xfs_buf_t *agbp; /* buffer containing allocation group header */
+ xfs_agino_t agino; /* inode number relative to allocation group */
+ xfs_agnumber_t agno; /* allocation group number */
+ xfs_agi_t *agi; /* allocation group header */
+ xfs_btree_cur_t *cur; /* inode btree cursor */
+ int error; /* error return value */
+ int i; /* result code */
+ int ilen; /* inodes in an inode cluster */
+ xfs_mount_t *mp; /* mount structure for filesystem */
+ int off; /* offset of inode in inode chunk */
+ xfs_inobt_rec_incore_t rec; /* btree record */
+ struct xfs_perag *pag;
+
+ mp = tp->t_mountp;
+
+ /*
+ * Break up inode number into its components.
+ */
+ agno = XFS_INO_TO_AGNO(mp, inode);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+ __func__, agno, mp->m_sb.sb_agcount);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ /*
+ * Get the allocation group header.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+ agi = XFS_BUF_TO_AGI(agbp);
+ ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ ASSERT(agbno < be32_to_cpu(agi->agi_length));
+ /*
+ * Initialize the cursor.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
+ /*
+ * Look for the entry describing this inode.
+ */
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+ xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ /*
+ * Get the offset in the inode chunk.
+ */
+ off = agino - rec.ir_startino;
+ ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+ ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+ /*
+ * Mark the inode free & increment the count.
+ */
+ rec.ir_free |= XFS_INOBT_MASK(off);
+ rec.ir_freecount++;
+
+ /*
+ * When an inode cluster is free, it becomes eligible for removal
+ */
+ if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
+ (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
+
+ *delete = 1;
+ *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+
+ /*
+ * Remove the inode cluster from the AGI B+Tree, adjust the
+ * AGI and Superblock inode counts, and mark the disk space
+ * to be freed when the transaction is committed.
+ */
+ ilen = XFS_IALLOC_INODES(mp);
+ be32_add_cpu(&agi->agi_count, -ilen);
+ be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+ pag = xfs_perag_get(mp, agno);
+ pag->pagi_freecount -= ilen - 1;
+ xfs_perag_put(pag);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+
+ if ((error = xfs_btree_delete(cur, &i))) {
+ xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
+ agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
+ XFS_IALLOC_BLOCKS(mp), flist, mp);
+ } else {
+ *delete = 0;
+
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+ __func__, error);
+ goto error0;
+ }
+
+ /*
+ * Change the inode free counts and log the ag/sb changes.
+ */
+ be32_add_cpu(&agi->agi_freecount, 1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag = xfs_perag_get(mp, agno);
+ pag->pagi_freecount++;
+ xfs_perag_put(pag);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+ }
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error0:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_imap_lookup(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ xfs_agblock_t agbno,
+ xfs_agblock_t *chunk_agbno,
+ xfs_agblock_t *offset_agbno,
+ int flags)
+{
+ struct xfs_inobt_rec_incore rec;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp;
+ int error;
+ int i;
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_alert(mp,
+ "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+ __func__, error, agno);
+ return error;
+ }
+
+ /*
+ * Lookup the inode record for the given agino. If the record cannot be
+ * found, then it's an invalid inode number and we should abort. Once
+ * we have a record, we need to ensure it contains the inode number
+ * we are looking up.
+ */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+ if (!error) {
+ if (i)
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (!error && i == 0)
+ error = EINVAL;
+ }
+
+ xfs_trans_brelse(tp, agbp);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ if (error)
+ return error;
+
+ /* check that the returned record contains the required inode */
+ if (rec.ir_startino > agino ||
+ rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+ return EINVAL;
+
+ /* for untrusted inodes check it is allocated first */
+ if ((flags & XFS_IGET_UNTRUSTED) &&
+ (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+ return EINVAL;
+
+ *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+ *offset_agbno = agbno - *chunk_agbno;
+ return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
+{
+ xfs_agblock_t agbno; /* block number of inode in the alloc group */
+ xfs_agino_t agino; /* inode number within alloc group */
+ xfs_agnumber_t agno; /* allocation group number */
+ int blks_per_cluster; /* num blocks per inode cluster */
+ xfs_agblock_t chunk_agbno; /* first block in inode chunk */
+ xfs_agblock_t cluster_agbno; /* first block in inode cluster */
+ int error; /* error code */
+ int offset; /* index of inode in its buffer */
+ int offset_agbno; /* blks from chunk start to inode */
+
+ ASSERT(ino != NULLFSINO);
+
+ /*
+ * Split up the inode number into its parts.
+ */
+ agno = XFS_INO_TO_AGNO(mp, ino);
+ agino = XFS_INO_TO_AGINO(mp, ino);
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+ ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+ /*
+ * Don't output diagnostic information for untrusted inodes
+ * as they can be invalid without implying corruption.
+ */
+ if (flags & XFS_IGET_UNTRUSTED)
+ return XFS_ERROR(EINVAL);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_alert(mp,
+ "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+ __func__, agno, mp->m_sb.sb_agcount);
+ }
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_alert(mp,
+ "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+ __func__, (unsigned long long)agbno,
+ (unsigned long)mp->m_sb.sb_agblocks);
+ }
+ if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ xfs_alert(mp,
+ "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+ __func__, ino,
+ XFS_AGINO_TO_INO(mp, agno, agino));
+ }
+ xfs_stack_trace();
+#endif /* DEBUG */
+ return XFS_ERROR(EINVAL);
+ }
+
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+
+ /*
+ * For bulkstat and handle lookups, we have an untrusted inode number
+ * that we have to verify is valid. We cannot do this just by reading
+ * the inode buffer as it may have been unlinked and removed leaving
+ * inodes in stale state on disk. Hence we have to do a btree lookup
+ * in all cases where an untrusted inode number is passed.
+ */
+ if (flags & XFS_IGET_UNTRUSTED) {
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ return error;
+ goto out_map;
+ }
+
+ /*
+ * If the inode cluster size is the same as the blocksize or
+ * smaller we get to the buffer by simple arithmetics.
+ */
+ if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
+ offset = XFS_INO_TO_OFFSET(mp, ino);
+ ASSERT(offset < mp->m_sb.sb_inopblock);
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, 1);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+ return 0;
+ }
+
+ /*
+ * If the inode chunks are aligned then use simple maths to
+ * find the location. Otherwise we have to do a btree
+ * lookup to find the location.
+ */
+ if (mp->m_inoalign_mask) {
+ offset_agbno = agbno & mp->m_inoalign_mask;
+ chunk_agbno = agbno - offset_agbno;
+ } else {
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ return error;
+ }
+
+out_map:
+ ASSERT(agbno >= chunk_agbno);
+ cluster_agbno = chunk_agbno +
+ ((offset_agbno / blks_per_cluster) * blks_per_cluster);
+ offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+ XFS_INO_TO_OFFSET(mp, ino);
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+ /*
+ * If the inode number maps to a block outside the bounds
+ * of the file system then return NULL rather than calling
+ * read_buf and panicing when we get an error from the
+ * driver.
+ */
+ if ((imap->im_blkno + imap->im_len) >
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+ xfs_alert(mp,
+ "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+ __func__, (unsigned long long) imap->im_blkno,
+ (unsigned long long) imap->im_len,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+ return XFS_ERROR(EINVAL);
+ }
+ return 0;
+}
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ int level;
+ uint maxblocks;
+ uint maxleafents;
+ int minleafrecs;
+ int minnoderecs;
+
+ maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+ XFS_INODES_PER_CHUNK_LOG;
+ minleafrecs = mp->m_alloc_mnr[0];
+ minnoderecs = mp->m_alloc_mnr[1];
+ maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ for (level = 1; maxblocks > 1; level++)
+ maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ mp->m_in_maxlevels = level;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_buf_t *bp, /* allocation group header buffer */
+ int fields) /* bitmask of fields to log */
+{
+ int first; /* first byte number */
+ int last; /* last byte number */
+ static const short offsets[] = { /* field starting offsets */
+ /* keep in sync with bit definitions */
+ offsetof(xfs_agi_t, agi_magicnum),
+ offsetof(xfs_agi_t, agi_versionnum),
+ offsetof(xfs_agi_t, agi_seqno),
+ offsetof(xfs_agi_t, agi_length),
+ offsetof(xfs_agi_t, agi_count),
+ offsetof(xfs_agi_t, agi_root),
+ offsetof(xfs_agi_t, agi_level),
+ offsetof(xfs_agi_t, agi_freecount),
+ offsetof(xfs_agi_t, agi_newino),
+ offsetof(xfs_agi_t, agi_dirino),
+ offsetof(xfs_agi_t, agi_unlinked),
+ sizeof(xfs_agi_t)
+ };
+#ifdef DEBUG
+ xfs_agi_t *agi; /* allocation group header */
+
+ agi = XFS_BUF_TO_AGI(bp);
+ ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+#endif
+ /*
+ * Compute byte offsets for the first and last fields.
+ */
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+ /*
+ * Log the allocation group inode header buffer.
+ */
+ xfs_trans_log_buf(tp, bp, first, last);
+}
+
+#ifdef DEBUG
+STATIC void
+xfs_check_agi_unlinked(
+ struct xfs_agi *agi)
+{
+ int i;
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+ ASSERT(agi->agi_unlinked[i]);
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
+{
+ struct xfs_agi *agi; /* allocation group header */
+ int agi_ok; /* agi is consistent */
+ int error;
+
+ ASSERT(agno != NULLAGNUMBER);
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, bpp);
+ if (error)
+ return error;
+
+ ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
+ agi = XFS_BUF_TO_AGI(*bpp);
+
+ /*
+ * Validate the magic number of the agi block.
+ */
+ agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
+ XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
+ be32_to_cpu(agi->agi_seqno) == agno;
+ if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))) {
+ XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
+ mp, agi);
+ xfs_trans_brelse(tp, *bpp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
+
+ xfs_check_agi_unlinked(agi);
+ return 0;
+}
+
+int
+xfs_ialloc_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
+{
+ struct xfs_agi *agi; /* allocation group header */
+ struct xfs_perag *pag; /* per allocation group data */
+ int error;
+
+ error = xfs_read_agi(mp, tp, agno, bpp);
+ if (error)
+ return error;
+
+ agi = XFS_BUF_TO_AGI(*bpp);
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_init) {
+ pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
+ pag->pagi_count = be32_to_cpu(agi->agi_count);
+ pag->pagi_init = 1;
+ }
+
+ /*
+ * It's possible for these to be out of sync if
+ * we are in the middle of a forced shutdown.
+ */
+ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+ XFS_FORCED_SHUTDOWN(mp));
+ xfs_perag_put(pag);
+ return 0;
+}
+
+/*
+ * Read in the agi to initialise the per-ag data in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_agnumber_t agno) /* allocation group number */
+{
+ xfs_buf_t *bp = NULL;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+ if (error)
+ return error;
+ if (bp)
+ xfs_trans_brelse(tp, bp);
+ return 0;
+}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
new file mode 100644
index 00000000..bb538547
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_IALLOC_H__
+#define __XFS_IALLOC_H__
+
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_imap;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Allocation parameters for inode allocation.
+ */
+#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
+#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
+
+/*
+ * Move inodes in clusters of this size.
+ */
+#define XFS_INODE_BIG_CLUSTER_SIZE 8192
+#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size
+
+/*
+ * Make an inode pointer out of the buffer/offset.
+ */
+static inline struct xfs_dinode *
+xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
+{
+ return (xfs_dinode_t *)
+ (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+}
+
+/*
+ * Find a free (set) bit in the inode bitmask.
+ */
+static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
+{
+ return xfs_lowbit64(*fp);
+}
+
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * To work within the constraint of one allocation per transaction,
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes. If an inode is
+ * available without an allocation, agbp would be set to the current
+ * agbp and alloc_done set to false.
+ * If an allocation needed to be done, agbp would be set to the
+ * inode header of the allocation group and alloc_done set to true.
+ * The caller should then commit the current transaction and allocate a new
+ * transaction. xfs_dialloc() should then be called again with
+ * the agbp value returned from the previous call.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated. The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ *
+ * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
+ */
+int /* error */
+xfs_dialloc(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t parent, /* parent inode (directory) */
+ mode_t mode, /* mode bits for new inode */
+ int okalloc, /* ok to allocate more space */
+ struct xfs_buf **agbp, /* buf for a.g. inode header */
+ boolean_t *alloc_done, /* an allocation was done to replenish
+ the free inodes */
+ xfs_ino_t *inop); /* inode number allocated */
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int /* error */
+xfs_difree(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t inode, /* inode to be freed */
+ struct xfs_bmap_free *flist, /* extents to free */
+ int *delete, /* set if inode cluster was deleted */
+ xfs_ino_t *first_ino); /* first inode in deleted cluster */
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t ino, /* inode to locate */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags); /* flags for inode btree lookup */
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+ struct xfs_mount *mp); /* file system mount structure */
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *bp, /* allocation group header buffer */
+ int fields); /* bitmask of fields to log */
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int /* error */
+xfs_ialloc_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp); /* allocation group hdr buf */
+
+/*
+ * Read in the allocation group header to initialise the per-ag data
+ * in the mount structure
+ */
+int
+xfs_ialloc_pagi_init(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno); /* allocation group number */
+
+/*
+ * Lookup a record by ino in the btree given by cur.
+ */
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ xfs_lookup_t dir, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec, int *stat);
+
+#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
new file mode 100644
index 00000000..16921f55
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+
+
+STATIC int
+xfs_inobt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_inobt_mnr[level != 0];
+}
+
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno);
+}
+
+STATIC void
+xfs_inobt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+
+ agi->agi_root = nptr->s;
+ be32_add_cpu(&agi->agi_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
+}
+
+STATIC int
+xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int length,
+ int *stat)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+ xfs_agblock_t sbno = be32_to_cpu(start->s);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+ error = xfs_alloc_vextent(&args);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+ new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
+ *stat = 1;
+ return 0;
+}
+
+STATIC int
+xfs_inobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ xfs_fsblock_t fsbno;
+ int error;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+ if (error)
+ return error;
+
+ xfs_trans_binval(cur->bc_tp, bp);
+ return error;
+}
+
+STATIC int
+xfs_inobt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_inobt_mxr[level != 0];
+}
+
+STATIC void
+xfs_inobt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->inobt.ir_startino = rec->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ rec->inobt.ir_startino = key->inobt.ir_startino;
+}
+
+STATIC void
+xfs_inobt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+ rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+}
+
+/*
+ * initial value of ptr for lookup
+ */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+
+ ptr->s = agi->agi_root;
+}
+
+STATIC __int64_t
+xfs_inobt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+ cur->bc_rec.i.ir_startino;
+}
+
+#ifdef DEBUG
+STATIC int
+xfs_inobt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->inobt.ir_startino) <
+ be32_to_cpu(k2->inobt.ir_startino);
+}
+
+STATIC int
+xfs_inobt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+ be32_to_cpu(r2->inobt.ir_startino);
+}
+#endif /* DEBUG */
+
+#ifdef XFS_BTREE_TRACE
+ktrace_t *xfs_inobt_trace_buf;
+
+STATIC void
+xfs_inobt_trace_enter(
+ struct xfs_btree_cur *cur,
+ const char *func,
+ char *s,
+ int type,
+ int line,
+ __psunsigned_t a0,
+ __psunsigned_t a1,
+ __psunsigned_t a2,
+ __psunsigned_t a3,
+ __psunsigned_t a4,
+ __psunsigned_t a5,
+ __psunsigned_t a6,
+ __psunsigned_t a7,
+ __psunsigned_t a8,
+ __psunsigned_t a9,
+ __psunsigned_t a10)
+{
+ ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
+ (void *)func, (void *)s, NULL, (void *)cur,
+ (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+ (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+ (void *)a8, (void *)a9, (void *)a10);
+}
+
+STATIC void
+xfs_inobt_trace_cursor(
+ struct xfs_btree_cur *cur,
+ __uint32_t *s0,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ *s0 = cur->bc_private.a.agno;
+ *l0 = cur->bc_rec.i.ir_startino;
+ *l1 = cur->bc_rec.i.ir_free;
+}
+
+STATIC void
+xfs_inobt_trace_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ *l0 = be32_to_cpu(key->inobt.ir_startino);
+ *l1 = 0;
+}
+
+STATIC void
+xfs_inobt_trace_record(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ __uint64_t *l0,
+ __uint64_t *l1,
+ __uint64_t *l2)
+{
+ *l0 = be32_to_cpu(rec->inobt.ir_startino);
+ *l1 = be32_to_cpu(rec->inobt.ir_freecount);
+ *l2 = be64_to_cpu(rec->inobt.ir_free);
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_inobt_set_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_rec_from_key = xfs_inobt_init_rec_from_key,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+
+#ifdef DEBUG
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+ .trace_enter = xfs_inobt_trace_enter,
+ .trace_cursor = xfs_inobt_trace_cursor,
+ .trace_key = xfs_inobt_trace_key,
+ .trace_record = xfs_inobt_trace_record,
+#endif
+};
+
+/*
+ * Allocate a new inode btree cursor.
+ */
+struct xfs_btree_cur * /* new inode btree cursor */
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agi structure */
+ xfs_agnumber_t agno) /* allocation group number */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ cur->bc_btnum = XFS_BTNUM_INO;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+ cur->bc_ops = &xfs_inobt_ops;
+
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+
+ return cur;
+}
+
+/*
+ * Calculate number of records in an inobt btree block.
+ */
+int
+xfs_inobt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_INOBT_BLOCK_LEN(mp);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
+}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
new file mode 100644
index 00000000..f782ad0c
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define __XFS_IALLOC_BTREE_H__
+
+/*
+ * Inode map on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+
+typedef __uint64_t xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+ return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec {
+ __be32 ir_startino; /* starting inode number */
+ __be32 ir_freecount; /* count of free inodes (set bits) */
+ __be64 ir_free; /* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+ xfs_agino_t ir_startino; /* starting inode number */
+ __int32_t ir_freecount; /* count of free inodes (set bits) */
+ xfs_inofree_t ir_free; /* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+ __be32 ir_startino; /* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
+ */
+#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+ ((xfs_inobt_rec_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+ ((xfs_inobt_key_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_inobt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_inobt_key_t) + \
+ ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+
+#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
new file mode 100644
index 00000000..ca752f05
--- /dev/null
+++ b/fs/xfs/xfs_iget.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_acl.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_quota.h"
+#include "xfs_utils.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_btree_trace.h"
+#include "xfs_trace.h"
+
+
+/*
+ * Define xfs inode iolock lockdep classes. We need to ensure that all active
+ * inodes are considered the same for lockdep purposes, including inodes that
+ * are recycled through the XFS_IRECLAIMABLE state. This is the the only way to
+ * guarantee the locks are considered the same when there are multiple lock
+ * initialisation siteѕ. Also, define a reclaimable inode class so it is
+ * obvious in lockdep reports which class the report is against.
+ */
+static struct lock_class_key xfs_iolock_active;
+struct lock_class_key xfs_iolock_reclaimable;
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+
+ /*
+ * if this didn't occur in transactions, we could use
+ * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+ * code up to do this anyway.
+ */
+ ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+ if (!ip)
+ return NULL;
+ if (inode_init_always(mp->m_super, VFS_I(ip))) {
+ kmem_zone_free(xfs_inode_zone, ip);
+ return NULL;
+ }
+
+ ASSERT(atomic_read(&ip->i_iocount) == 0);
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(completion_done(&ip->i_flush));
+ ASSERT(ip->i_ino == 0);
+
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+ &xfs_iolock_active, "xfs_iolock_active");
+
+ /* initialise the xfs inode */
+ ip->i_ino = ino;
+ ip->i_mount = mp;
+ memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+ ip->i_afp = NULL;
+ memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+ ip->i_flags = 0;
+ ip->i_update_core = 0;
+ ip->i_delayed_blks = 0;
+ memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+ ip->i_size = 0;
+ ip->i_new_size = 0;
+
+ return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+ struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_zone_free(xfs_inode_zone, ip);
+}
+
+void
+xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ switch (ip->i_d.di_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ break;
+ }
+
+ if (ip->i_afp)
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+ if (ip->i_itemp) {
+ /*
+ * Only if we are shutting down the fs will we see an
+ * inode still in the AIL. If it is there, we should remove
+ * it to prevent a use-after-free from occurring.
+ */
+ xfs_log_item_t *lip = &ip->i_itemp->ili_item;
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+ XFS_FORCED_SHUTDOWN(ip->i_mount));
+ if (lip->li_flags & XFS_LI_IN_AIL) {
+ spin_lock(&ailp->xa_lock);
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(ailp, lip);
+ else
+ spin_unlock(&ailp->xa_lock);
+ }
+ xfs_inode_item_destroy(ip);
+ ip->i_itemp = NULL;
+ }
+
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_iocount) == 0);
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(completion_done(&ip->i_flush));
+
+ /*
+ * Because we use RCU freeing we need to ensure the inode always
+ * appears to be reclaimed with an invalid inode number when in the
+ * free state. The ip->i_flags_lock provides the barrier against lookup
+ * races.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip,
+ xfs_ino_t ino,
+ int flags,
+ int lock_flags) __releases(RCU)
+{
+ struct inode *inode = VFS_I(ip);
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /*
+ * check for re-use of an inode within an RCU grace period due to the
+ * radix tree nodes not being updated yet. We monitor for this by
+ * setting the inode number to zero before freeing the inode structure.
+ * If the inode has been reallocated and set up, then the inode number
+ * will not match, so check for that, too.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+
+ /*
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimabe state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
+ */
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ trace_xfs_iget_reclaim(ip);
+
+ /*
+ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+ * from stomping over us while we recycle the inode. We can't
+ * clear the radix tree reclaimable tag yet as it requires
+ * pag_ici_lock to be held exclusive.
+ */
+ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ error = -inode_init_always(mp->m_super, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ rcu_read_lock();
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+ trace_xfs_iget_reclaim_fail(ip);
+ goto out_error;
+ }
+
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ /*
+ * Clear the per-lifetime state in the inode as we are now
+ * effectively a new inode and need to return to the initial
+ * state before reuse occurs.
+ */
+ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+ ip->i_flags |= XFS_INEW;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ inode->i_state = I_NEW;
+
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+ &xfs_iolock_active, "xfs_iolock_active");
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ } else {
+ /* If the VFS inode is being torn down, pause and try again. */
+ if (!igrab(inode)) {
+ trace_xfs_iget_skip(ip);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ trace_xfs_iget_hit(ip);
+ }
+
+ if (lock_flags != 0)
+ xfs_ilock(ip, lock_flags);
+
+ xfs_iflags_clear(ip, XFS_ISTALE);
+ XFS_STATS_INC(xs_ig_found);
+
+ return 0;
+
+out_error:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp,
+ int flags,
+ int lock_flags)
+{
+ struct xfs_inode *ip;
+ int error;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+
+ ip = xfs_inode_alloc(mp, ino);
+ if (!ip)
+ return ENOMEM;
+
+ error = xfs_iread(mp, tp, ip, flags);
+ if (error)
+ goto out_destroy;
+
+ trace_xfs_iget_miss(ip);
+
+ if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_destroy;
+ }
+
+ /*
+ * Preload the radix tree so we can insert safely under the
+ * write spinlock. Note that we cannot sleep inside the preload
+ * region.
+ */
+ if (radix_tree_preload(GFP_KERNEL)) {
+ error = EAGAIN;
+ goto out_destroy;
+ }
+
+ /*
+ * Because the inode hasn't been added to the radix-tree yet it can't
+ * be found by another thread, so we can do the non-sleeping lock here.
+ */
+ if (lock_flags) {
+ if (!xfs_ilock_nowait(ip, lock_flags))
+ BUG();
+ }
+
+ /*
+ * These values must be set before inserting the inode into the radix
+ * tree as the moment it is inserted a concurrent lookup (allowed by the
+ * RCU locking mechanism) can find it and that lookup must see that this
+ * is an inode currently under construction (i.e. that XFS_INEW is set).
+ * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+ * memory barrier that ensures this detection works correctly at lookup
+ * time.
+ */
+ ip->i_udquot = ip->i_gdquot = NULL;
+ xfs_iflags_set(ip, XFS_INEW);
+
+ /* insert the new inode */
+ spin_lock(&pag->pag_ici_lock);
+ error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+ if (unlikely(error)) {
+ WARN_ON(error != -EEXIST);
+ XFS_STATS_INC(xs_ig_dup);
+ error = EAGAIN;
+ goto out_preload_end;
+ }
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+
+ *ipp = ip;
+ return 0;
+
+out_preload_end:
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+out_destroy:
+ __destroy_inode(VFS_I(ip));
+ xfs_inode_free(ip);
+ return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system. It points
+ * to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one. This is
+ * simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired. This is the unique identifier
+ * within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode. See the comment
+ * for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ xfs_inode_t **ipp)
+{
+ xfs_inode_t *ip;
+ int error;
+ xfs_perag_t *pag;
+ xfs_agino_t agino;
+
+ /* reject inode numbers outside existing AGs */
+ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ return EINVAL;
+
+ /* get the perag structure and ensure that it's inode capable */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+ agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+ error = 0;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+ if (ip) {
+ error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ } else {
+ rcu_read_unlock();
+ XFS_STATS_INC(xs_ig_missed);
+
+ error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+ flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ }
+ xfs_perag_put(pag);
+
+ *ipp = ip;
+
+ ASSERT(ip->i_df.if_ext_max ==
+ XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
+ /*
+ * If we have a real type for an on-disk inode, we can set ops(&unlock)
+ * now. If it's a new inode being created, xfs_ialloc will handle it.
+ */
+ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+ xfs_setup_inode(ip);
+ return 0;
+
+out_error_or_again:
+ if (error == EAGAIN) {
+ delay(1);
+ goto again;
+ }
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * This is a wrapper routine around the xfs_ilock() routine
+ * used to centralize some grungy code. It is used in places
+ * that wish to lock the inode solely for reading the extents.
+ * The reason these places can't just call xfs_ilock(SHARED)
+ * is that the inode lock also guards to bringing in of the
+ * extents from disk for a file in b-tree format. If the inode
+ * is in b-tree format, then we need to lock the inode exclusively
+ * until the extents are read in. Locking it exclusively all
+ * the time would limit our parallelism unnecessarily, though.
+ * What we do instead is check to see if the extents have been
+ * read in yet, and only lock the inode exclusively if they
+ * have not.
+ *
+ * The function returns a value which should be given to the
+ * corresponding xfs_iunlock_map_shared(). This value is
+ * the mode in which the lock was actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+ xfs_inode_t *ip)
+{
+ uint lock_mode;
+
+ if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+ ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ } else {
+ lock_mode = XFS_ILOCK_SHARED;
+ }
+
+ xfs_ilock(ip, lock_mode);
+
+ return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+ xfs_inode_t *ip,
+ unsigned int lock_mode)
+{
+ xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock. This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ * to be locked. It can be:
+ * XFS_IOLOCK_SHARED,
+ * XFS_IOLOCK_EXCL,
+ * XFS_ILOCK_SHARED,
+ * XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+
+ trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep. It returns 1 if it gets
+ * the requested locks and 0 otherwise. If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ * to be locked. See the comment for xfs_ilock() for a list
+ * of valid values.
+ */
+int
+xfs_ilock_nowait(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_iolock))
+ goto out;
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_iolock))
+ goto out;
+ }
+ if (lock_flags & XFS_ILOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_lock))
+ goto out_undo_iolock;
+ } else if (lock_flags & XFS_ILOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_lock))
+ goto out_undo_iolock;
+ }
+ trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+ return 1;
+
+ out_undo_iolock:
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
+ out:
+ return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ * to be unlocked. See the comment for xfs_ilock() for a list
+ * of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
+ XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrunlock_excl(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mrunlock_shared(&ip->i_lock);
+
+ if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) &&
+ !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) {
+ /*
+ * Let the AIL know that this item has been unlocked in case
+ * it is in the AIL and anyone is waiting on it. Don't do
+ * this if the caller has asked us not to.
+ */
+ xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
+ (xfs_log_item_t*)(ip->i_itemp));
+ }
+ trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks. the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrdemote(&ip->i_lock);
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrdemote(&ip->i_iolock);
+
+ trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+ if (!(lock_flags & XFS_ILOCK_SHARED))
+ return !!ip->i_lock.mr_writer;
+ return rwsem_is_locked(&ip->i_lock.mr_lock);
+ }
+
+ if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+ if (!(lock_flags & XFS_IOLOCK_SHARED))
+ return !!ip->i_iolock.mr_writer;
+ return rwsem_is_locked(&ip->i_iolock.mr_lock);
+ }
+
+ ASSERT(0);
+ return 0;
+}
+#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
new file mode 100644
index 00000000..57152799
--- /dev/null
+++ b/fs/xfs/xfs_inode.c
@@ -0,0 +1,4145 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_utils.h"
+#include "xfs_quota.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+kmem_zone_t *xfs_ifork_zone;
+kmem_zone_t *xfs_inode_zone;
+
+/*
+ * Used in xfs_itruncate(). This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define XFS_ITRUNC_MAX_EXTENTS 2
+
+STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+STATIC void
+xfs_validate_extents(
+ xfs_ifork_t *ifp,
+ int nrecs,
+ xfs_exntfmt_t fmt)
+{
+ xfs_bmbt_irec_t irec;
+ xfs_bmbt_rec_host_t rec;
+ int i;
+
+ for (i = 0; i < nrecs; i++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+ rec.l0 = get_unaligned(&ep->l0);
+ rec.l1 = get_unaligned(&ep->l1);
+ xfs_bmbt_get_all(&rec, &irec);
+ if (fmt == XFS_EXTFMT_NOSTATE)
+ ASSERT(irec.br_state == XFS_EXT_NORM);
+ }
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+
+/*
+ * Check that none of the inode's in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+ xfs_mount_t *mp,
+ xfs_buf_t *bp)
+{
+ int i;
+ int j;
+ xfs_dinode_t *dip;
+
+ j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+ for (i = 0; i < j; i++) {
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+ i * mp->m_sb.sb_inodesize);
+ if (!dip->di_next_unlinked) {
+ xfs_alert(mp,
+ "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
+ bp);
+ ASSERT(dip->di_next_unlinked);
+ }
+ }
+}
+#endif
+
+/*
+ * Find the buffer associated with the given inode map
+ * We do basic validation checks on the buffer once it has been
+ * retrieved from disk.
+ */
+STATIC int
+xfs_imap_to_bp(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ struct xfs_imap *imap,
+ xfs_buf_t **bpp,
+ uint buf_flags,
+ uint iget_flags)
+{
+ int error;
+ int i;
+ int ni;
+ xfs_buf_t *bp;
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+ (int)imap->im_len, buf_flags, &bp);
+ if (error) {
+ if (error != EAGAIN) {
+ xfs_warn(mp,
+ "%s: xfs_trans_read_buf() returned error %d.",
+ __func__, error);
+ } else {
+ ASSERT(buf_flags & XBF_TRYLOCK);
+ }
+ return error;
+ }
+
+ /*
+ * Validate the magic number and version of every inode in the buffer
+ * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+ */
+#ifdef DEBUG
+ ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
+#else /* usual case */
+ ni = 1;
+#endif
+
+ for (i = 0; i < ni; i++) {
+ int di_ok;
+ xfs_dinode_t *dip;
+
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+ (i << mp->m_sb.sb_inodelog));
+ di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
+ XFS_DINODE_GOOD_VERSION(dip->di_version);
+ if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+ XFS_ERRTAG_ITOBP_INOTOBP,
+ XFS_RANDOM_ITOBP_INOTOBP))) {
+ if (iget_flags & XFS_IGET_UNTRUSTED) {
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EINVAL);
+ }
+ XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
+ XFS_ERRLEVEL_HIGH, mp, dip);
+#ifdef DEBUG
+ xfs_emerg(mp,
+ "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+ (unsigned long long)imap->im_blkno, i,
+ be16_to_cpu(dip->di_magic));
+ ASSERT(0);
+#endif
+ xfs_trans_brelse(tp, bp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ }
+
+ xfs_inobp_check(mp, bp);
+
+ /*
+ * Mark the buffer as an inode buffer now that it looks good
+ */
+ XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * This routine is called to map an inode number within a file
+ * system to the buffer containing the on-disk version of the
+ * inode. It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dip parameter
+ * it returns a pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and
+ * dipp are undefined.
+ *
+ * Use xfs_imap() to determine the size and location of the
+ * buffer to read from disk.
+ */
+int
+xfs_inotobp(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ xfs_dinode_t **dipp,
+ xfs_buf_t **bpp,
+ int *offset,
+ uint imap_flags)
+{
+ struct xfs_imap imap;
+ xfs_buf_t *bp;
+ int error;
+
+ imap.im_blkno = 0;
+ error = xfs_imap(mp, tp, ino, &imap, imap_flags);
+ if (error)
+ return error;
+
+ error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
+ if (error)
+ return error;
+
+ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+ *bpp = bp;
+ *offset = imap.im_boffset;
+ return 0;
+}
+
+
+/*
+ * This routine is called to map an inode to the buffer containing
+ * the on-disk version of the inode. It returns a pointer to the
+ * buffer containing the on-disk inode in the bpp parameter, and in
+ * the dip parameter it returns a pointer to the on-disk inode within
+ * that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and
+ * dipp are undefined.
+ *
+ * The inode is expected to already been mapped to its buffer and read
+ * in once, thus we can use the mapping information stored in the inode
+ * rather than calling xfs_imap(). This allows us to avoid the overhead
+ * of looking at the inode btree for small block file systems
+ * (see xfs_imap()).
+ */
+int
+xfs_itobp(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ xfs_dinode_t **dipp,
+ xfs_buf_t **bpp,
+ uint buf_flags)
+{
+ xfs_buf_t *bp;
+ int error;
+
+ ASSERT(ip->i_imap.im_blkno != 0);
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
+ if (error)
+ return error;
+
+ if (!bp) {
+ ASSERT(buf_flags & XBF_TRYLOCK);
+ ASSERT(tp == NULL);
+ *bpp = NULL;
+ return EAGAIN;
+ }
+
+ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode. For fifos, devs, and sockets
+ * this means set if_rdev to the proper value. For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers. For a file in B-tree format, only the root is immediately
+ * brought in-core. The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ */
+STATIC int
+xfs_iformat(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip)
+{
+ xfs_attr_shortform_t *atp;
+ int size;
+ int error;
+ xfs_fsize_t di_size;
+ ip->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+ error = 0;
+
+ if (unlikely(be32_to_cpu(dip->di_nextents) +
+ be16_to_cpu(dip->di_anextents) >
+ be64_to_cpu(dip->di_nblocks))) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+ (unsigned long long)ip->i_ino,
+ (int)(be32_to_cpu(dip->di_nextents) +
+ be16_to_cpu(dip->di_anextents)),
+ (unsigned long long)
+ be64_to_cpu(dip->di_nblocks));
+ XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+ xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+ (unsigned long long)ip->i_ino,
+ dip->di_forkoff);
+ XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+ !ip->i_mount->m_rtdev_targp)) {
+ xfs_warn(ip->i_mount,
+ "corrupt dinode %Lu, has realtime flag set.",
+ ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ switch (ip->i_d.di_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+ XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ ip->i_d.di_size = 0;
+ ip->i_size = 0;
+ ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+ break;
+
+ case S_IFREG:
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * no local regular files yet
+ */
+ if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (local format for regular file).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ di_size = be64_to_cpu(dip->di_size);
+ if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (bad size %Ld for local inode).",
+ (unsigned long long) ip->i_ino,
+ (long long) di_size);
+ XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ size = (int)di_size;
+ error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+ break;
+ default:
+ XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ break;
+
+ default:
+ XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ if (error) {
+ return error;
+ }
+ if (!XFS_DFORK_Q(dip))
+ return 0;
+ ASSERT(ip->i_afp == NULL);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+ ip->i_afp->if_ext_max =
+ XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_LOCAL:
+ atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+ size = be16_to_cpu(atp->hdr.totsize);
+
+ if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (bad attr fork size %Ld).",
+ (unsigned long long) ip->i_ino,
+ (long long) size);
+ XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+ break;
+ default:
+ error = XFS_ERROR(EFSCORRUPTED);
+ break;
+ }
+ if (error) {
+ kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+ ip->i_afp = NULL;
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ }
+ return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there. Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ int whichfork,
+ int size)
+{
+ xfs_ifork_t *ifp;
+ int real_size;
+
+ /*
+ * If the size is unreasonable, then something
+ * is wrong and we just bail out rather than crash in
+ * kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ xfs_warn(ip->i_mount,
+ "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+ (unsigned long long) ip->i_ino, size,
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+ XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ real_size = 0;
+ if (size == 0)
+ ifp->if_u1.if_data = NULL;
+ else if (size <= sizeof(ifp->if_u2.if_inline_data))
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ else {
+ real_size = roundup(size, 4);
+ ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+ }
+ ifp->if_bytes = size;
+ ifp->if_real_bytes = real_size;
+ if (size)
+ memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
+ ifp->if_flags &= ~XFS_IFEXTENTS;
+ ifp->if_flags |= XFS_IFINLINE;
+ return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it. Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ int whichfork)
+{
+ xfs_bmbt_rec_t *dp;
+ xfs_ifork_t *ifp;
+ int nex;
+ int size;
+ int i;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+ /*
+ * If the number of extents is unreasonable, then something
+ * is wrong and we just bail out rather than crash in
+ * kmem_alloc() or memcpy() below.
+ */
+ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+ (unsigned long long) ip->i_ino, nex);
+ XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ ifp->if_real_bytes = 0;
+ if (nex == 0)
+ ifp->if_u1.if_extents = NULL;
+ else if (nex <= XFS_INLINE_EXTS)
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+ else
+ xfs_iext_add(ifp, 0, nex);
+
+ ifp->if_bytes = size;
+ if (size) {
+ dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+ xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
+ for (i = 0; i < nex; i++, dp++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+ ep->l0 = get_unaligned_be64(&dp->l0);
+ ep->l1 = get_unaligned_be64(&dp->l1);
+ }
+ XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+ if (whichfork != XFS_DATA_FORK ||
+ XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
+ if (unlikely(xfs_check_nostate_extents(
+ ifp, 0, nex))) {
+ XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+ XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ }
+ ifp->if_flags |= XFS_IFEXTENTS;
+ return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it. The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ int whichfork)
+{
+ xfs_bmdr_block_t *dfp;
+ xfs_ifork_t *ifp;
+ /* REFERENCED */
+ int nrecs;
+ int size;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+ size = XFS_BMAP_BROOT_SPACE(dfp);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+
+ /*
+ * blow out if -- fork has less extents than can fit in
+ * fork (fork shouldn't be a btree format), root btree
+ * block has more records than can fit into the fork,
+ * or the number of extents is greater than the number of
+ * blocks.
+ */
+ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
+ || XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
+ || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+ xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+ ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ ifp->if_broot_bytes = size;
+ ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+ ASSERT(ifp->if_broot != NULL);
+ /*
+ * Copy and convert from the on-disk structure
+ * to the in-memory structure.
+ */
+ xfs_bmdr_to_bmbt(ip->i_mount, dfp,
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+ ifp->if_broot, size);
+ ifp->if_flags &= ~XFS_IFEXTENTS;
+ ifp->if_flags |= XFS_IFBROOT;
+
+ return 0;
+}
+
+STATIC void
+xfs_dinode_from_disk(
+ xfs_icdinode_t *to,
+ xfs_dinode_t *from)
+{
+ to->di_magic = be16_to_cpu(from->di_magic);
+ to->di_mode = be16_to_cpu(from->di_mode);
+ to->di_version = from ->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = be16_to_cpu(from->di_onlink);
+ to->di_uid = be32_to_cpu(from->di_uid);
+ to->di_gid = be32_to_cpu(from->di_gid);
+ to->di_nlink = be32_to_cpu(from->di_nlink);
+ to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+ to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+ memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+ to->di_flushiter = be16_to_cpu(from->di_flushiter);
+ to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+ to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+ to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+ to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+ to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+ to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+ to->di_size = be64_to_cpu(from->di_size);
+ to->di_nblocks = be64_to_cpu(from->di_nblocks);
+ to->di_extsize = be32_to_cpu(from->di_extsize);
+ to->di_nextents = be32_to_cpu(from->di_nextents);
+ to->di_anextents = be16_to_cpu(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
+ to->di_dmstate = be16_to_cpu(from->di_dmstate);
+ to->di_flags = be16_to_cpu(from->di_flags);
+ to->di_gen = be32_to_cpu(from->di_gen);
+}
+
+void
+xfs_dinode_to_disk(
+ xfs_dinode_t *to,
+ xfs_icdinode_t *from)
+{
+ to->di_magic = cpu_to_be16(from->di_magic);
+ to->di_mode = cpu_to_be16(from->di_mode);
+ to->di_version = from ->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = cpu_to_be16(from->di_onlink);
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_nlink = cpu_to_be32(from->di_nlink);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+ to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+ to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+ to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+ to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+ to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+ to->di_gen = cpu_to_be32(from->di_gen);
+}
+
+STATIC uint
+_xfs_dic2xflags(
+ __uint16_t di_flags)
+{
+ uint flags = 0;
+
+ if (di_flags & XFS_DIFLAG_ANY) {
+ if (di_flags & XFS_DIFLAG_REALTIME)
+ flags |= XFS_XFLAG_REALTIME;
+ if (di_flags & XFS_DIFLAG_PREALLOC)
+ flags |= XFS_XFLAG_PREALLOC;
+ if (di_flags & XFS_DIFLAG_IMMUTABLE)
+ flags |= XFS_XFLAG_IMMUTABLE;
+ if (di_flags & XFS_DIFLAG_APPEND)
+ flags |= XFS_XFLAG_APPEND;
+ if (di_flags & XFS_DIFLAG_SYNC)
+ flags |= XFS_XFLAG_SYNC;
+ if (di_flags & XFS_DIFLAG_NOATIME)
+ flags |= XFS_XFLAG_NOATIME;
+ if (di_flags & XFS_DIFLAG_NODUMP)
+ flags |= XFS_XFLAG_NODUMP;
+ if (di_flags & XFS_DIFLAG_RTINHERIT)
+ flags |= XFS_XFLAG_RTINHERIT;
+ if (di_flags & XFS_DIFLAG_PROJINHERIT)
+ flags |= XFS_XFLAG_PROJINHERIT;
+ if (di_flags & XFS_DIFLAG_NOSYMLINKS)
+ flags |= XFS_XFLAG_NOSYMLINKS;
+ if (di_flags & XFS_DIFLAG_EXTSIZE)
+ flags |= XFS_XFLAG_EXTSIZE;
+ if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
+ flags |= XFS_XFLAG_EXTSZINHERIT;
+ if (di_flags & XFS_DIFLAG_NODEFRAG)
+ flags |= XFS_XFLAG_NODEFRAG;
+ if (di_flags & XFS_DIFLAG_FILESTREAM)
+ flags |= XFS_XFLAG_FILESTREAM;
+ }
+
+ return flags;
+}
+
+uint
+xfs_ip2xflags(
+ xfs_inode_t *ip)
+{
+ xfs_icdinode_t *dic = &ip->i_d;
+
+ return _xfs_dic2xflags(dic->di_flags) |
+ (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
+}
+
+uint
+xfs_dic2xflags(
+ xfs_dinode_t *dip)
+{
+ return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
+ (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
+}
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ */
+int
+xfs_iread(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ uint iget_flags)
+{
+ xfs_buf_t *bp;
+ xfs_dinode_t *dip;
+ int error;
+
+ /*
+ * Fill in the location information in the in-core inode.
+ */
+ error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+ if (error)
+ return error;
+
+ /*
+ * Get pointers to the on-disk inode and the buffer containing it.
+ */
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
+ XBF_LOCK, iget_flags);
+ if (error)
+ return error;
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+
+ /*
+ * If we got something that isn't an inode it means someone
+ * (nfs or dmi) has a stale handle.
+ */
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
+#ifdef DEBUG
+ xfs_alert(mp,
+ "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
+ __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
+#endif /* DEBUG */
+ error = XFS_ERROR(EINVAL);
+ goto out_brelse;
+ }
+
+ /*
+ * If the on-disk inode is already linked to a directory
+ * entry, copy all of the inode into the in-core inode.
+ * xfs_iformat() handles copying in the inode format
+ * specific information.
+ * Otherwise, just get the truly permanent information.
+ */
+ if (dip->di_mode) {
+ xfs_dinode_from_disk(&ip->i_d, dip);
+ error = xfs_iformat(ip, dip);
+ if (error) {
+#ifdef DEBUG
+ xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+ __func__, error);
+#endif /* DEBUG */
+ goto out_brelse;
+ }
+ } else {
+ ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+ ip->i_d.di_version = dip->di_version;
+ ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+ ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+ /*
+ * Make sure to pull in the mode here as well in
+ * case the inode is released without being used.
+ * This ensures that xfs_inactive() will see that
+ * the inode is already free and not try to mess
+ * with the uninitialized part of it.
+ */
+ ip->i_d.di_mode = 0;
+ /*
+ * Initialize the per-fork minima and maxima for a new
+ * inode here. xfs_iformat will do it for old inodes.
+ */
+ ip->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+ }
+
+ /*
+ * The inode format changed when we moved the link count and
+ * made it 32 bits long. If this is an old format inode,
+ * convert it in memory to look like a new one. If it gets
+ * flushed to disk we will convert back before flushing or
+ * logging it. We zero out the new projid field and the old link
+ * count field. We'll handle clearing the pad field (the remains
+ * of the old uuid field) when we actually convert the inode to
+ * the new format. We don't change the version number so that we
+ * can distinguish this from a real new format inode.
+ */
+ if (ip->i_d.di_version == 1) {
+ ip->i_d.di_nlink = ip->i_d.di_onlink;
+ ip->i_d.di_onlink = 0;
+ xfs_set_projid(ip, 0);
+ }
+
+ ip->i_delayed_blks = 0;
+ ip->i_size = ip->i_d.di_size;
+
+ /*
+ * Mark the buffer containing the inode as something to keep
+ * around for a while. This helps to keep recently accessed
+ * meta-data in-core longer.
+ */
+ xfs_buf_set_ref(bp, XFS_INO_REF);
+
+ /*
+ * Use xfs_trans_brelse() to release the buffer containing the
+ * on-disk inode, because it was acquired with xfs_trans_read_buf()
+ * in xfs_itobp() above. If tp is NULL, this is just a normal
+ * brelse(). If we're within a transaction, then xfs_trans_brelse()
+ * will only release the buffer if it is not dirty within the
+ * transaction. It will be OK to release the buffer in this case,
+ * because inodes on disk are never destroyed and we will be
+ * locking the new in-core inode before putting it in the hash
+ * table where other processes can find it. Thus we don't have
+ * to worry about the inode being changed just because we released
+ * the buffer.
+ */
+ out_brelse:
+ xfs_trans_brelse(tp, bp);
+ return error;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ int error;
+ xfs_ifork_t *ifp;
+ xfs_extnum_t nextents;
+
+ if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+ XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+ ip->i_mount);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ /*
+ * We know that the size is valid (it's checked in iformat_btree)
+ */
+ ifp->if_bytes = ifp->if_real_bytes = 0;
+ ifp->if_flags |= XFS_IFEXTENTS;
+ xfs_iext_add(ifp, 0, nextents);
+ error = xfs_bmap_read_extents(tp, ip, whichfork);
+ if (error) {
+ xfs_iext_destroy(ifp);
+ ifp->if_flags &= ~XFS_IFEXTENTS;
+ return error;
+ }
+ xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+ return 0;
+}
+
+/*
+ * Allocate an inode on disk and return a copy of its in-core version.
+ * The in-core inode is locked exclusively. Set mode, nlink, and rdev
+ * appropriately within the inode. The uid and gid for the inode are
+ * set according to the contents of the given cred structure.
+ *
+ * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
+ * has a free inode available, call xfs_iget()
+ * to obtain the in-core version of the allocated inode. Finally,
+ * fill in the inode and log its initial contents. In this case,
+ * ialloc_context would be set to NULL and call_again set to false.
+ *
+ * If xfs_dialloc() does not have an available inode,
+ * it will replenish its supply by doing an allocation. Since we can
+ * only do one allocation within a transaction without deadlocks, we
+ * must commit the current transaction before returning the inode itself.
+ * In this case, therefore, we will set call_again to true and return.
+ * The caller should then commit the current transaction, start a new
+ * transaction, and call xfs_ialloc() again to actually get the inode.
+ *
+ * To ensure that some other process does not grab the inode that
+ * was allocated during the first call to xfs_ialloc(), this routine
+ * also returns the [locked] bp pointing to the head of the freelist
+ * as ialloc_context. The caller should hold this buffer across
+ * the commit and pass it back into this routine on the second call.
+ *
+ * If we are allocating quota inodes, we do not have a parent inode
+ * to attach to or associate with (i.e. pip == NULL) because they
+ * are not linked into the directory structure - they are attached
+ * directly to the superblock - and so have no parent.
+ */
+int
+xfs_ialloc(
+ xfs_trans_t *tp,
+ xfs_inode_t *pip,
+ mode_t mode,
+ xfs_nlink_t nlink,
+ xfs_dev_t rdev,
+ prid_t prid,
+ int okalloc,
+ xfs_buf_t **ialloc_context,
+ boolean_t *call_again,
+ xfs_inode_t **ipp)
+{
+ xfs_ino_t ino;
+ xfs_inode_t *ip;
+ uint flags;
+ int error;
+ timespec_t tv;
+ int filestreams = 0;
+
+ /*
+ * Call the space management code to pick
+ * the on-disk inode to be allocated.
+ */
+ error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
+ ialloc_context, call_again, &ino);
+ if (error)
+ return error;
+ if (*call_again || ino == NULLFSINO) {
+ *ipp = NULL;
+ return 0;
+ }
+ ASSERT(*ialloc_context == NULL);
+
+ /*
+ * Get the in-core inode with the lock held exclusively.
+ * This is because we're setting fields here we need
+ * to prevent others from looking at until we're done.
+ */
+ error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
+ XFS_ILOCK_EXCL, &ip);
+ if (error)
+ return error;
+ ASSERT(ip != NULL);
+
+ ip->i_d.di_mode = (__uint16_t)mode;
+ ip->i_d.di_onlink = 0;
+ ip->i_d.di_nlink = nlink;
+ ASSERT(ip->i_d.di_nlink == nlink);
+ ip->i_d.di_uid = current_fsuid();
+ ip->i_d.di_gid = current_fsgid();
+ xfs_set_projid(ip, prid);
+ memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+
+ /*
+ * If the superblock version is up to where we support new format
+ * inodes and this is currently an old format inode, then change
+ * the inode version number now. This way we only do the conversion
+ * here rather than here and in the flush/logging code.
+ */
+ if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
+ ip->i_d.di_version == 1) {
+ ip->i_d.di_version = 2;
+ /*
+ * We've already zeroed the old link count, the projid field,
+ * and the pad field.
+ */
+ }
+
+ /*
+ * Project ids won't be stored on disk if we are using a version 1 inode.
+ */
+ if ((prid != 0) && (ip->i_d.di_version == 1))
+ xfs_bump_ino_vers2(tp, ip);
+
+ if (pip && XFS_INHERIT_GID(pip)) {
+ ip->i_d.di_gid = pip->i_d.di_gid;
+ if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
+ ip->i_d.di_mode |= S_ISGID;
+ }
+ }
+
+ /*
+ * If the group ID of the new file does not match the effective group
+ * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
+ * (and only if the irix_sgid_inherit compatibility variable is set).
+ */
+ if ((irix_sgid_inherit) &&
+ (ip->i_d.di_mode & S_ISGID) &&
+ (!in_group_p((gid_t)ip->i_d.di_gid))) {
+ ip->i_d.di_mode &= ~S_ISGID;
+ }
+
+ ip->i_d.di_size = 0;
+ ip->i_size = 0;
+ ip->i_d.di_nextents = 0;
+ ASSERT(ip->i_d.di_nblocks == 0);
+
+ nanotime(&tv);
+ ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+ ip->i_d.di_atime = ip->i_d.di_mtime;
+ ip->i_d.di_ctime = ip->i_d.di_mtime;
+
+ /*
+ * di_gen will have been taken care of in xfs_iread.
+ */
+ ip->i_d.di_extsize = 0;
+ ip->i_d.di_dmevmask = 0;
+ ip->i_d.di_dmstate = 0;
+ ip->i_d.di_flags = 0;
+ flags = XFS_ILOG_CORE;
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ ip->i_d.di_format = XFS_DINODE_FMT_DEV;
+ ip->i_df.if_u2.if_rdev = rdev;
+ ip->i_df.if_flags = 0;
+ flags |= XFS_ILOG_DEV;
+ break;
+ case S_IFREG:
+ /*
+ * we can't set up filestreams until after the VFS inode
+ * is set up properly.
+ */
+ if (pip && xfs_inode_is_filestream(pip))
+ filestreams = 1;
+ /* fall through */
+ case S_IFDIR:
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
+ uint di_flags = 0;
+
+ if ((mode & S_IFMT) == S_IFDIR) {
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ } else if ((mode & S_IFMT) == S_IFREG) {
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ }
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
+ xfs_inherit_noatime)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
+ xfs_inherit_nodump)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
+ xfs_inherit_sync)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
+ xfs_inherit_nosymlinks)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+ ip->i_d.di_flags |= di_flags;
+ }
+ /* FALLTHROUGH */
+ case S_IFLNK:
+ ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+ ip->i_df.if_flags = XFS_IFEXTENTS;
+ ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
+ ip->i_df.if_u1.if_extents = NULL;
+ break;
+ default:
+ ASSERT(0);
+ }
+ /*
+ * Attribute fork settings for new inode.
+ */
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ ip->i_d.di_anextents = 0;
+
+ /*
+ * Log the new values stuffed into the inode.
+ */
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, flags);
+
+ /* now that we have an i_mode we can setup inode ops and unlock */
+ xfs_setup_inode(ip);
+
+ /* now we have set up the vfs inode we can associate the filestream */
+ if (filestreams) {
+ error = xfs_filestream_associate(pip, ip);
+ if (error < 0)
+ return -error;
+ if (!error)
+ xfs_iflags_set(ip, XFS_IFILESTREAM);
+ }
+
+ *ipp = ip;
+ return 0;
+}
+
+/*
+ * Check to make sure that there are no blocks allocated to the
+ * file beyond the size of the file. We don't check this for
+ * files with fixed size extents or real time extents, but we
+ * at least do it for regular files.
+ */
+#ifdef DEBUG
+void
+xfs_isize_check(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ xfs_fsize_t isize)
+{
+ xfs_fileoff_t map_first;
+ int nimaps;
+ xfs_bmbt_irec_t imaps[2];
+
+ if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
+ return;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ return;
+
+ if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
+ return;
+
+ nimaps = 2;
+ map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+ /*
+ * The filesystem could be shutting down, so bmapi may return
+ * an error.
+ */
+ if (xfs_bmapi(NULL, ip, map_first,
+ (XFS_B_TO_FSB(mp,
+ (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
+ map_first),
+ XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
+ NULL))
+ return;
+ ASSERT(nimaps == 1);
+ ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
+}
+#endif /* DEBUG */
+
+/*
+ * Calculate the last possible buffered byte in a file. This must
+ * include data that was buffered beyond the EOF by the write code.
+ * This also needs to deal with overflowing the xfs_fsize_t type
+ * which can happen for sizes near the limit.
+ *
+ * We also need to take into account any blocks beyond the EOF. It
+ * may be the case that they were buffered by a write which failed.
+ * In that case the pages will still be in memory, but the inode size
+ * will never have been updated.
+ */
+STATIC xfs_fsize_t
+xfs_file_last_byte(
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp;
+ xfs_fsize_t last_byte;
+ xfs_fileoff_t last_block;
+ xfs_fileoff_t size_last_block;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
+
+ mp = ip->i_mount;
+ /*
+ * Only check for blocks beyond the EOF if the extents have
+ * been read in. This eliminates the need for the inode lock,
+ * and it also saves us from looking when it really isn't
+ * necessary.
+ */
+ if (ip->i_df.if_flags & XFS_IFEXTENTS) {
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmap_last_offset(NULL, ip, &last_block,
+ XFS_DATA_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error) {
+ last_block = 0;
+ }
+ } else {
+ last_block = 0;
+ }
+ size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
+ last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
+
+ last_byte = XFS_FSB_TO_B(mp, last_block);
+ if (last_byte < 0) {
+ return XFS_MAXIOFFSET(mp);
+ }
+ last_byte += (1 << mp->m_writeio_log);
+ if (last_byte < 0) {
+ return XFS_MAXIOFFSET(mp);
+ }
+ return last_byte;
+}
+
+/*
+ * Start the truncation of the file to new_size. The new size
+ * must be smaller than the current size. This routine will
+ * clear the buffer and page caches of file data in the removed
+ * range, and xfs_itruncate_finish() will remove the underlying
+ * disk blocks.
+ *
+ * The inode must have its I/O lock locked EXCLUSIVELY, and it
+ * must NOT have the inode lock held at all. This is because we're
+ * calling into the buffer/page cache code and we can't hold the
+ * inode lock when we do so.
+ *
+ * We need to wait for any direct I/Os in flight to complete before we
+ * proceed with the truncate. This is needed to prevent the extents
+ * being read or written by the direct I/Os from being removed while the
+ * I/O is in flight as there is no other method of synchronising
+ * direct I/O with the truncate operation. Also, because we hold
+ * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
+ * started until the truncate completes and drops the lock. Essentially,
+ * the xfs_ioend_wait() call forms an I/O barrier that provides strict
+ * ordering between direct I/Os and the truncate operation.
+ *
+ * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
+ * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
+ * in the case that the caller is locking things out of order and
+ * may not be able to call xfs_itruncate_finish() with the inode lock
+ * held without dropping the I/O lock. If the caller must drop the
+ * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
+ * must be called again with all the same restrictions as the initial
+ * call.
+ */
+int
+xfs_itruncate_start(
+ xfs_inode_t *ip,
+ uint flags,
+ xfs_fsize_t new_size)
+{
+ xfs_fsize_t last_byte;
+ xfs_off_t toss_start;
+ xfs_mount_t *mp;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT((new_size == 0) || (new_size <= ip->i_size));
+ ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
+ (flags == XFS_ITRUNC_MAYBE));
+
+ mp = ip->i_mount;
+
+ /* wait for the completion of any pending DIOs */
+ if (new_size == 0 || new_size < ip->i_size)
+ xfs_ioend_wait(ip);
+
+ /*
+ * Call toss_pages or flushinval_pages to get rid of pages
+ * overlapping the region being removed. We have to use
+ * the less efficient flushinval_pages in the case that the
+ * caller may not be able to finish the truncate without
+ * dropping the inode's I/O lock. Make sure
+ * to catch any pages brought in by buffers overlapping
+ * the EOF by searching out beyond the isize by our
+ * block size. We round new_size up to a block boundary
+ * so that we don't toss things on the same block as
+ * new_size but before it.
+ *
+ * Before calling toss_page or flushinval_pages, make sure to
+ * call remapf() over the same region if the file is mapped.
+ * This frees up mapped file references to the pages in the
+ * given range and for the flushinval_pages case it ensures
+ * that we get the latest mapped changes flushed out.
+ */
+ toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+ toss_start = XFS_FSB_TO_B(mp, toss_start);
+ if (toss_start < 0) {
+ /*
+ * The place to start tossing is beyond our maximum
+ * file size, so there is no way that the data extended
+ * out there.
+ */
+ return 0;
+ }
+ last_byte = xfs_file_last_byte(ip);
+ trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
+ if (last_byte > toss_start) {
+ if (flags & XFS_ITRUNC_DEFINITE) {
+ xfs_tosspages(ip, toss_start,
+ -1, FI_REMAPF_LOCKED);
+ } else {
+ error = xfs_flushinval_pages(ip, toss_start,
+ -1, FI_REMAPF_LOCKED);
+ }
+ }
+
+#ifdef DEBUG
+ if (new_size == 0) {
+ ASSERT(VN_CACHED(VFS_I(ip)) == 0);
+ }
+#endif
+ return error;
+}
+
+/*
+ * Shrink the file to the given new_size. The new size must be smaller than
+ * the current size. This will free up the underlying blocks in the removed
+ * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
+ *
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here. Some transaction will be
+ * returned to the caller to be committed. The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction. On return the inode
+ * will be "held" within the returned transaction. This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
+ *
+ * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
+ * indicates the fork which is to be truncated. For the attribute fork we only
+ * support truncation to size 0.
+ *
+ * We use the sync parameter to indicate whether or not the first transaction
+ * we perform might have to be synchronous. For the attr fork, it needs to be
+ * so if the unlink of the inode is not yet known to be permanent in the log.
+ * This keeps us from freeing and reusing the blocks of the attribute fork
+ * before the unlink of the inode becomes permanent.
+ *
+ * For the data fork, we normally have to run synchronously if we're being
+ * called out of the inactive path or we're being called out of the create path
+ * where we're truncating an existing file. Either way, the truncate needs to
+ * be sync so blocks don't reappear in the file with altered data in case of a
+ * crash. wsync filesystems can run the first case async because anything that
+ * shrinks the inode has to run sync so by the time we're called here from
+ * inactive, the inode size is permanently set to 0.
+ *
+ * Calls from the truncate path always need to be sync unless we're in a wsync
+ * filesystem and the file has already been unlinked.
+ *
+ * The caller is responsible for correctly setting the sync parameter. It gets
+ * too hard for us to guess here which path we're being called out of just
+ * based on inode state.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not. We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
+ */
+int
+xfs_itruncate_finish(
+ xfs_trans_t **tp,
+ xfs_inode_t *ip,
+ xfs_fsize_t new_size,
+ int fork,
+ int sync)
+{
+ xfs_fsblock_t first_block;
+ xfs_fileoff_t first_unmap_block;
+ xfs_fileoff_t last_block;
+ xfs_filblks_t unmap_len=0;
+ xfs_mount_t *mp;
+ xfs_trans_t *ntp;
+ int done;
+ int committed;
+ xfs_bmap_free_t free_list;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+ ASSERT((new_size == 0) || (new_size <= ip->i_size));
+ ASSERT(*tp != NULL);
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(ip->i_transp == *tp);
+ ASSERT(ip->i_itemp != NULL);
+ ASSERT(ip->i_itemp->ili_lock_flags == 0);
+
+
+ ntp = *tp;
+ mp = (ntp)->t_mountp;
+ ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
+
+ /*
+ * We only support truncating the entire attribute fork.
+ */
+ if (fork == XFS_ATTR_FORK) {
+ new_size = 0LL;
+ }
+ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+ trace_xfs_itruncate_finish_start(ip, new_size);
+
+ /*
+ * The first thing we do is set the size to new_size permanently
+ * on disk. This way we don't have to worry about anyone ever
+ * being able to look at the data being freed even in the face
+ * of a crash. What we're getting around here is the case where
+ * we free a block, it is allocated to another file, it is written
+ * to, and then we crash. If the new data gets written to the
+ * file but the log buffers containing the free and reallocation
+ * don't, then we'd end up with garbage in the blocks being freed.
+ * As long as we make the new_size permanent before actually
+ * freeing any blocks it doesn't matter if they get written to.
+ *
+ * The callers must signal into us whether or not the size
+ * setting here must be synchronous. There are a few cases
+ * where it doesn't have to be synchronous. Those cases
+ * occur if the file is unlinked and we know the unlink is
+ * permanent or if the blocks being truncated are guaranteed
+ * to be beyond the inode eof (regardless of the link count)
+ * and the eof value is permanent. Both of these cases occur
+ * only on wsync-mounted filesystems. In those cases, we're
+ * guaranteed that no user will ever see the data in the blocks
+ * that are being truncated so the truncate can run async.
+ * In the free beyond eof case, the file may wind up with
+ * more blocks allocated to it than it needs if we crash
+ * and that won't get fixed until the next time the file
+ * is re-opened and closed but that's ok as that shouldn't
+ * be too many blocks.
+ *
+ * However, we can't just make all wsync xactions run async
+ * because there's one call out of the create path that needs
+ * to run sync where it's truncating an existing file to size
+ * 0 whose size is > 0.
+ *
+ * It's probably possible to come up with a test in this
+ * routine that would correctly distinguish all the above
+ * cases from the values of the function parameters and the
+ * inode state but for sanity's sake, I've decided to let the
+ * layers above just tell us. It's simpler to correctly figure
+ * out in the layer above exactly under what conditions we
+ * can run async and I think it's easier for others read and
+ * follow the logic in case something has to be changed.
+ * cscope is your friend -- rcc.
+ *
+ * The attribute fork is much simpler.
+ *
+ * For the attribute fork we allow the caller to tell us whether
+ * the unlink of the inode that led to this call is yet permanent
+ * in the on disk log. If it is not and we will be freeing extents
+ * in this inode then we make the first transaction synchronous
+ * to make sure that the unlink is permanent by the time we free
+ * the blocks.
+ */
+ if (fork == XFS_DATA_FORK) {
+ if (ip->i_d.di_nextents > 0) {
+ /*
+ * If we are not changing the file size then do
+ * not update the on-disk file size - we may be
+ * called from xfs_inactive_free_eofblocks(). If we
+ * update the on-disk file size and then the system
+ * crashes before the contents of the file are
+ * flushed to disk then the files may be full of
+ * holes (ie NULL files bug).
+ */
+ if (ip->i_size != new_size) {
+ ip->i_d.di_size = new_size;
+ ip->i_size = new_size;
+ xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+ }
+ }
+ }
+
+ /*
+ * Since it is possible for space to become allocated beyond
+ * the end of the file (in a crash where the space is allocated
+ * but the inode size is not yet updated), simply remove any
+ * blocks which show up between the new EOF and the maximum
+ * possible file size. If the first block to be removed is
+ * beyond the maximum file size (ie it is the same as last_block),
+ * then there is nothing to do.
+ */
+ last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+ ASSERT(first_unmap_block <= last_block);
+ done = 0;
+ if (last_block == first_unmap_block) {
+ done = 1;
+ } else {
+ unmap_len = last_block - first_unmap_block + 1;
+ }
+ while (!done) {
+ /*
+ * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi()
+ * will tell us whether it freed the entire range or
+ * not. If this is a synchronous mount (wsync),
+ * then we can tell bunmapi to keep all the
+ * transactions asynchronous since the unlink
+ * transaction that made this inode inactive has
+ * already hit the disk. There's no danger of
+ * the freed blocks being reused, there being a
+ * crash, and the reused blocks suddenly reappearing
+ * in this file with garbage in them once recovery
+ * runs.
+ */
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_bunmapi(ntp, ip,
+ first_unmap_block, unmap_len,
+ xfs_bmapi_aflag(fork),
+ XFS_ITRUNC_MAX_EXTENTS,
+ &first_block, &free_list,
+ &done);
+ if (error) {
+ /*
+ * If the bunmapi call encounters an error,
+ * return to the caller where the transaction
+ * can be properly aborted. We just need to
+ * make sure we're not holding any resources
+ * that we were not when we came in.
+ */
+ xfs_bmap_cancel(&free_list);
+ return error;
+ }
+
+ /*
+ * Duplicate the transaction that has the permanent
+ * reservation and commit the old transaction.
+ */
+ error = xfs_bmap_finish(tp, &free_list, &committed);
+ ntp = *tp;
+ if (committed)
+ xfs_trans_ijoin(ntp, ip);
+
+ if (error) {
+ /*
+ * If the bmap finish call encounters an error, return
+ * to the caller where the transaction can be properly
+ * aborted. We just need to make sure we're not
+ * holding any resources that we were not when we came
+ * in.
+ *
+ * Aborting from this point might lose some blocks in
+ * the file system, but oh well.
+ */
+ xfs_bmap_cancel(&free_list);
+ return error;
+ }
+
+ if (committed) {
+ /*
+ * Mark the inode dirty so it will be logged and
+ * moved forward in the log as part of every commit.
+ */
+ xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+ }
+
+ ntp = xfs_trans_dup(ntp);
+ error = xfs_trans_commit(*tp, 0);
+ *tp = ntp;
+
+ xfs_trans_ijoin(ntp, ip);
+
+ if (error)
+ return error;
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(ntp->t_ticket);
+ error = xfs_trans_reserve(ntp, 0,
+ XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
+ if (error)
+ return error;
+ }
+ /*
+ * Only update the size in the case of the data fork, but
+ * always re-log the inode so that our permanent transaction
+ * can keep on rolling it forward in the log.
+ */
+ if (fork == XFS_DATA_FORK) {
+ xfs_isize_check(mp, ip, new_size);
+ /*
+ * If we are not changing the file size then do
+ * not update the on-disk file size - we may be
+ * called from xfs_inactive_free_eofblocks(). If we
+ * update the on-disk file size and then the system
+ * crashes before the contents of the file are
+ * flushed to disk then the files may be full of
+ * holes (ie NULL files bug).
+ */
+ if (ip->i_size != new_size) {
+ ip->i_d.di_size = new_size;
+ ip->i_size = new_size;
+ }
+ }
+ xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+ ASSERT((new_size != 0) ||
+ (fork == XFS_ATTR_FORK) ||
+ (ip->i_delayed_blks == 0));
+ ASSERT((new_size != 0) ||
+ (fork == XFS_ATTR_FORK) ||
+ (ip->i_d.di_nextents == 0));
+ trace_xfs_itruncate_finish_end(ip, new_size);
+ return 0;
+}
+
+/*
+ * This is called when the inode's link count goes to 0.
+ * We place the on-disk inode on a list in the AGI. It
+ * will be pulled from this list when the inode is freed.
+ */
+int
+xfs_iunlink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp;
+ xfs_agi_t *agi;
+ xfs_dinode_t *dip;
+ xfs_buf_t *agibp;
+ xfs_buf_t *ibp;
+ xfs_agino_t agino;
+ short bucket_index;
+ int offset;
+ int error;
+
+ ASSERT(ip->i_d.di_nlink == 0);
+ ASSERT(ip->i_d.di_mode != 0);
+ ASSERT(ip->i_transp == tp);
+
+ mp = tp->t_mountp;
+
+ /*
+ * Get the agi buffer first. It ensures lock ordering
+ * on the list.
+ */
+ error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ if (error)
+ return error;
+ agi = XFS_BUF_TO_AGI(agibp);
+
+ /*
+ * Get the index into the agi hash table for the
+ * list this inode will go on.
+ */
+ agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ ASSERT(agino != 0);
+ bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ ASSERT(agi->agi_unlinked[bucket_index]);
+ ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+
+ if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket we need
+ * to add ourselves to. Add us at the front of the list.
+ * Here we put the head pointer into our next pointer,
+ * and then we fall through to point the head at us.
+ */
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ if (error)
+ return error;
+
+ ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
+ /* both on-disk, don't endian flip twice */
+ dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
+ offset = ip->i_imap.im_boffset +
+ offsetof(xfs_dinode_t, di_next_unlinked);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+ xfs_inobp_check(mp, ibp);
+ }
+
+ /*
+ * Point the bucket head pointer at the inode being inserted.
+ */
+ ASSERT(agino != 0);
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
+ offset = offsetof(xfs_agi_t, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+ return 0;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ xfs_ino_t next_ino;
+ xfs_mount_t *mp;
+ xfs_agi_t *agi;
+ xfs_dinode_t *dip;
+ xfs_buf_t *agibp;
+ xfs_buf_t *ibp;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ xfs_agino_t next_agino;
+ xfs_buf_t *last_ibp;
+ xfs_dinode_t *last_dip = NULL;
+ short bucket_index;
+ int offset, last_offset = 0;
+ int error;
+
+ mp = tp->t_mountp;
+ agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+
+ /*
+ * Get the agi buffer first. It ensures lock ordering
+ * on the list.
+ */
+ error = xfs_read_agi(mp, tp, agno, &agibp);
+ if (error)
+ return error;
+
+ agi = XFS_BUF_TO_AGI(agibp);
+
+ /*
+ * Get the index into the agi hash table for the
+ * list this inode will go on.
+ */
+ agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ ASSERT(agino != 0);
+ bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO);
+ ASSERT(agi->agi_unlinked[bucket_index]);
+
+ if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
+ /*
+ * We're at the head of the list. Get the inode's
+ * on-disk buffer to see if there is anyone after us
+ * on the list. Only modify our next pointer if it
+ * is not already NULLAGINO. This saves us the overhead
+ * of dealing with the buffer when there is no need to
+ * change it.
+ */
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
+ __func__, error);
+ return error;
+ }
+ next_agino = be32_to_cpu(dip->di_next_unlinked);
+ ASSERT(next_agino != 0);
+ if (next_agino != NULLAGINO) {
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ offset = ip->i_imap.im_boffset +
+ offsetof(xfs_dinode_t, di_next_unlinked);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+ xfs_inobp_check(mp, ibp);
+ } else {
+ xfs_trans_brelse(tp, ibp);
+ }
+ /*
+ * Point the bucket head pointer at the next inode.
+ */
+ ASSERT(next_agino != 0);
+ ASSERT(next_agino != agino);
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
+ offset = offsetof(xfs_agi_t, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+ } else {
+ /*
+ * We need to search the list for the inode being freed.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ last_ibp = NULL;
+ while (next_agino != agino) {
+ /*
+ * If the last inode wasn't the one pointing to
+ * us, then release its buffer since we're not
+ * going to do anything with it.
+ */
+ if (last_ibp != NULL) {
+ xfs_trans_brelse(tp, last_ibp);
+ }
+ next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+ error = xfs_inotobp(mp, tp, next_ino, &last_dip,
+ &last_ibp, &last_offset, 0);
+ if (error) {
+ xfs_warn(mp,
+ "%s: xfs_inotobp() returned error %d.",
+ __func__, error);
+ return error;
+ }
+ next_agino = be32_to_cpu(last_dip->di_next_unlinked);
+ ASSERT(next_agino != NULLAGINO);
+ ASSERT(next_agino != 0);
+ }
+ /*
+ * Now last_ibp points to the buffer previous to us on
+ * the unlinked list. Pull us from the list.
+ */
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
+ __func__, error);
+ return error;
+ }
+ next_agino = be32_to_cpu(dip->di_next_unlinked);
+ ASSERT(next_agino != 0);
+ ASSERT(next_agino != agino);
+ if (next_agino != NULLAGINO) {
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ offset = ip->i_imap.im_boffset +
+ offsetof(xfs_dinode_t, di_next_unlinked);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+ xfs_inobp_check(mp, ibp);
+ } else {
+ xfs_trans_brelse(tp, ibp);
+ }
+ /*
+ * Point the previous inode on the list to the next inode.
+ */
+ last_dip->di_next_unlinked = cpu_to_be32(next_agino);
+ ASSERT(next_agino != 0);
+ offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+ xfs_trans_inode_buf(tp, last_ibp);
+ xfs_trans_log_buf(tp, last_ibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+ xfs_inobp_check(mp, last_ibp);
+ }
+ return 0;
+}
+
+/*
+ * A big issue when freeing the inode cluster is is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
+STATIC void
+xfs_ifree_cluster(
+ xfs_inode_t *free_ip,
+ xfs_trans_t *tp,
+ xfs_ino_t inum)
+{
+ xfs_mount_t *mp = free_ip->i_mount;
+ int blks_per_cluster;
+ int nbufs;
+ int ninodes;
+ int i, j;
+ xfs_daddr_t blkno;
+ xfs_buf_t *bp;
+ xfs_inode_t *ip;
+ xfs_inode_log_item_t *iip;
+ xfs_log_item_t *lip;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
+ if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+ blks_per_cluster = 1;
+ ninodes = mp->m_sb.sb_inopblock;
+ nbufs = XFS_IALLOC_BLOCKS(mp);
+ } else {
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+ mp->m_sb.sb_blocksize;
+ ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+ nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
+ }
+
+ for (j = 0; j < nbufs; j++, inum += ninodes) {
+ blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
+ XFS_INO_TO_AGBNO(mp, inum));
+
+ /*
+ * We obtain and lock the backing buffer first in the process
+ * here, as we have to ensure that any dirty inode that we
+ * can't get the flush lock on is attached to the buffer.
+ * If we scan the in-memory inodes first, then buffer IO can
+ * complete before we get a lock on it, and hence we may fail
+ * to mark all the active inodes on the buffer stale.
+ */
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+ mp->m_bsize * blks_per_cluster,
+ XBF_LOCK);
+
+ /*
+ * Walk the inodes already attached to the buffer and mark them
+ * stale. These will all have the flush locks held, so an
+ * in-memory inode walk can't lock them. By marking them all
+ * stale first, we will not attempt to lock them in the loop
+ * below as the XFS_ISTALE flag will be set.
+ */
+ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ while (lip) {
+ if (lip->li_type == XFS_LI_INODE) {
+ iip = (xfs_inode_log_item_t *)lip;
+ ASSERT(iip->ili_logged == 1);
+ lip->li_cb = xfs_istale_done;
+ xfs_trans_ail_copy_lsn(mp->m_ail,
+ &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+ xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
+ }
+ lip = lip->li_bio_list;
+ }
+
+
+ /*
+ * For each inode in memory attempt to add it to the inode
+ * buffer and set it up for being staled on buffer IO
+ * completion. This is safe as we've locked out tail pushing
+ * and flushing by locking the buffer.
+ *
+ * We have already marked every inode that was part of a
+ * transaction stale above, which means there is no point in
+ * even trying to lock them.
+ */
+ for (i = 0; i < ninodes; i++) {
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(mp, (inum + i)));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ /*
+ * because this is an RCU protected lookup, we could
+ * find a recently freed or even reallocated inode
+ * during the lookup. We need to check under the
+ * i_flags_lock for a valid inode here. Skip it if it
+ * is not valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum + i ||
+ __xfs_iflags_test(ip, XFS_ISTALE)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ continue;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Don't try to lock/unlock the current inode, but we
+ * _cannot_ skip the other inodes that we did not find
+ * in the list attached to the buffer and are not
+ * already marked stale. If we can't lock it, back off
+ * and retry.
+ */
+ if (ip != free_ip &&
+ !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+ rcu_read_unlock();
+
+ xfs_iflock(ip);
+ xfs_iflags_set(ip, XFS_ISTALE);
+
+ /*
+ * we don't need to attach clean inodes or those only
+ * with unlogged changes (which we throw away, anyway).
+ */
+ iip = ip->i_itemp;
+ if (!iip || xfs_inode_clean(ip)) {
+ ASSERT(ip != free_ip);
+ ip->i_update_core = 0;
+ xfs_ifunlock(ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ continue;
+ }
+
+ iip->ili_last_fields = iip->ili_format.ilf_fields;
+ iip->ili_format.ilf_fields = 0;
+ iip->ili_logged = 1;
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+
+ xfs_buf_attach_iodone(bp, xfs_istale_done,
+ &iip->ili_item);
+
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ xfs_trans_stale_inode_buf(tp, bp);
+ xfs_trans_binval(tp, bp);
+ }
+
+ xfs_perag_put(pag);
+}
+
+/*
+ * This is called to return an inode to the inode free list.
+ * The inode should already be truncated to 0 length and have
+ * no pages associated with it. This routine also assumes that
+ * the inode is already a part of the transaction.
+ *
+ * The on-disk copy of the inode will have been added to the list
+ * of unlinked inodes in the AGI. We need to remove the inode from
+ * that list atomically with respect to freeing it here.
+ */
+int
+xfs_ifree(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ xfs_bmap_free_t *flist)
+{
+ int error;
+ int delete;
+ xfs_ino_t first_ino;
+ xfs_dinode_t *dip;
+ xfs_buf_t *ibp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(ip->i_transp == tp);
+ ASSERT(ip->i_d.di_nlink == 0);
+ ASSERT(ip->i_d.di_nextents == 0);
+ ASSERT(ip->i_d.di_anextents == 0);
+ ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
+ ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
+ ASSERT(ip->i_d.di_nblocks == 0);
+
+ /*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+ error = xfs_iunlink_remove(tp, ip);
+ if (error != 0) {
+ return error;
+ }
+
+ error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+ if (error != 0) {
+ return error;
+ }
+ ip->i_d.di_mode = 0; /* mark incore inode as free */
+ ip->i_d.di_flags = 0;
+ ip->i_d.di_dmevmask = 0;
+ ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
+ ip->i_df.if_ext_max =
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+ ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+ ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ /*
+ * Bump the generation count so no one will be confused
+ * by reincarnations of this inode.
+ */
+ ip->i_d.di_gen++;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
+ if (error)
+ return error;
+
+ /*
+ * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
+ * from picking up this inode when it is reclaimed (its incore state
+ * initialzed but not flushed to disk yet). The in-core di_mode is
+ * already cleared and a corresponding transaction logged.
+ * The hack here just synchronizes the in-core to on-disk
+ * di_mode value in advance before the actual inode sync to disk.
+ * This is OK because the inode is already unlinked and would never
+ * change its di_mode again for this inode generation.
+ * This is a temporary hack that would require a proper fix
+ * in the future.
+ */
+ dip->di_mode = 0;
+
+ if (delete) {
+ xfs_ifree_cluster(ip, tp, first_ino);
+ }
+
+ return 0;
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff. Move the records
+ * and pointers in if_broot to fit the new size. When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller. When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root. If the if_broot is currently NULL, then
+ * if we adding records one will be allocated. The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ * requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+ xfs_inode_t *ip,
+ int rec_diff,
+ int whichfork)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int cur_max;
+ xfs_ifork_t *ifp;
+ struct xfs_btree_block *new_broot;
+ int new_max;
+ size_t new_size;
+ char *np;
+ char *op;
+
+ /*
+ * Handle the degenerate case quietly.
+ */
+ if (rec_diff == 0) {
+ return;
+ }
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (rec_diff > 0) {
+ /*
+ * If there wasn't any memory allocated before, just
+ * allocate it now and get out.
+ */
+ if (ifp->if_broot_bytes == 0) {
+ new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
+ ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ ifp->if_broot_bytes = (int)new_size;
+ return;
+ }
+
+ /*
+ * If there is already an existing if_broot, then we need
+ * to realloc() it and shift the pointers to their new
+ * location. The records don't change location because
+ * they are kept butted up against the btree block header.
+ */
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+ ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+ (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
+ KM_SLEEP | KM_NOFS);
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ (int)new_size);
+ ifp->if_broot_bytes = (int)new_size;
+ ASSERT(ifp->if_broot_bytes <=
+ XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+ memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+ return;
+ }
+
+ /*
+ * rec_diff is less than 0. In this case, we are shrinking the
+ * if_broot buffer. It must already exist. If we go to zero
+ * records, just get rid of the root and clear the status bit.
+ */
+ ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ new_max = cur_max + rec_diff;
+ ASSERT(new_max >= 0);
+ if (new_max > 0)
+ new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+ else
+ new_size = 0;
+ if (new_size > 0) {
+ new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ /*
+ * First copy over the btree block header.
+ */
+ memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
+ } else {
+ new_broot = NULL;
+ ifp->if_flags &= ~XFS_IFBROOT;
+ }
+
+ /*
+ * Only copy the records and pointers if there are any.
+ */
+ if (new_max > 0) {
+ /*
+ * First copy the records.
+ */
+ op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+ np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+ /*
+ * Then copy the pointers.
+ */
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+ (int)new_size);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+ }
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = new_broot;
+ ifp->if_broot_bytes = (int)new_size;
+ ASSERT(ifp->if_broot_bytes <=
+ XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+ return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased. The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer. Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ * requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+ xfs_inode_t *ip,
+ int byte_diff,
+ int whichfork)
+{
+ xfs_ifork_t *ifp;
+ int new_size;
+ int real_size;
+
+ if (byte_diff == 0) {
+ return;
+ }
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ new_size = (int)ifp->if_bytes + byte_diff;
+ ASSERT(new_size >= 0);
+
+ if (new_size == 0) {
+ if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ kmem_free(ifp->if_u1.if_data);
+ }
+ ifp->if_u1.if_data = NULL;
+ real_size = 0;
+ } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+ /*
+ * If the valid extents/data can fit in if_inline_ext/data,
+ * copy them from the malloc'd vector and free it.
+ */
+ if (ifp->if_u1.if_data == NULL) {
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ ASSERT(ifp->if_real_bytes != 0);
+ memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+ new_size);
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+ }
+ real_size = 0;
+ } else {
+ /*
+ * Stuck with malloc/realloc.
+ * For inline data, the underlying buffer must be
+ * a multiple of 4 bytes in size so that it can be
+ * logged and stay on word boundaries. We enforce
+ * that here.
+ */
+ real_size = roundup(new_size, 4);
+ if (ifp->if_u1.if_data == NULL) {
+ ASSERT(ifp->if_real_bytes == 0);
+ ifp->if_u1.if_data = kmem_alloc(real_size,
+ KM_SLEEP | KM_NOFS);
+ } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+ /*
+ * Only do the realloc if the underlying size
+ * is really changing.
+ */
+ if (ifp->if_real_bytes != real_size) {
+ ifp->if_u1.if_data =
+ kmem_realloc(ifp->if_u1.if_data,
+ real_size,
+ ifp->if_real_bytes,
+ KM_SLEEP | KM_NOFS);
+ }
+ } else {
+ ASSERT(ifp->if_real_bytes == 0);
+ ifp->if_u1.if_data = kmem_alloc(real_size,
+ KM_SLEEP | KM_NOFS);
+ memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+ ifp->if_bytes);
+ }
+ }
+ ifp->if_real_bytes = real_size;
+ ifp->if_bytes = new_size;
+ ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+void
+xfs_idestroy_fork(
+ xfs_inode_t *ip,
+ int whichfork)
+{
+ xfs_ifork_t *ifp;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (ifp->if_broot != NULL) {
+ kmem_free(ifp->if_broot);
+ ifp->if_broot = NULL;
+ }
+
+ /*
+ * If the format is local, then we can't have an extents
+ * array so just look for an inline data array. If we're
+ * not local then we may or may not have an extents list,
+ * so check and free it up if we do.
+ */
+ if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+ (ifp->if_u1.if_data != NULL)) {
+ ASSERT(ifp->if_real_bytes != 0);
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ ifp->if_real_bytes = 0;
+ }
+ } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+ ((ifp->if_flags & XFS_IFEXTIREC) ||
+ ((ifp->if_u1.if_extents != NULL) &&
+ (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+ ASSERT(ifp->if_real_bytes != 0);
+ xfs_iext_destroy(ifp);
+ }
+ ASSERT(ifp->if_u1.if_extents == NULL ||
+ ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+ ASSERT(ifp->if_real_bytes == 0);
+ if (whichfork == XFS_ATTR_FORK) {
+ kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+ ip->i_afp = NULL;
+ }
+}
+
+/*
+ * This is called to unpin an inode. The caller must have the inode locked
+ * in at least shared mode so that the buffer cannot be subsequently pinned
+ * once someone is waiting for it to be unpinned.
+ */
+static void
+xfs_iunpin_nowait(
+ struct xfs_inode *ip)
+{
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+
+ trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+
+ /* Give the log a push to start the unpinning I/O */
+ xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
+
+}
+
+void
+xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ if (xfs_ipincount(ip)) {
+ xfs_iunpin_nowait(ip);
+ wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
+ }
+}
+
+/*
+ * xfs_iextents_copy()
+ *
+ * This is called to copy the REAL extents (as opposed to the delayed
+ * allocation extents) from the inode into the given buffer. It
+ * returns the number of bytes copied into the buffer.
+ *
+ * If there are no delayed allocation extents, then we can just
+ * memcpy() the extents into the buffer. Otherwise, we need to
+ * examine each extent in turn and skip those which are delayed.
+ */
+int
+xfs_iextents_copy(
+ xfs_inode_t *ip,
+ xfs_bmbt_rec_t *dp,
+ int whichfork)
+{
+ int copied;
+ int i;
+ xfs_ifork_t *ifp;
+ int nrecs;
+ xfs_fsblock_t start_block;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(ifp->if_bytes > 0);
+
+ nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+ ASSERT(nrecs > 0);
+
+ /*
+ * There are some delayed allocation extents in the
+ * inode, so copy the extents one at a time and skip
+ * the delayed ones. There must be at least one
+ * non-delayed extent.
+ */
+ copied = 0;
+ for (i = 0; i < nrecs; i++) {
+ xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+ start_block = xfs_bmbt_get_startblock(ep);
+ if (isnullstartblock(start_block)) {
+ /*
+ * It's a delayed allocation extent, so skip it.
+ */
+ continue;
+ }
+
+ /* Translate to on disk format */
+ put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
+ put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
+ dp++;
+ copied++;
+ }
+ ASSERT(copied != 0);
+ xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+ return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_iflush_fork(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+ xfs_inode_log_item_t *iip,
+ int whichfork,
+ xfs_buf_t *bp)
+{
+ char *cp;
+ xfs_ifork_t *ifp;
+ xfs_mount_t *mp;
+#ifdef XFS_TRANS_DEBUG
+ int first;
+#endif
+ static const short brootflag[2] =
+ { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+ static const short dataflag[2] =
+ { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+ static const short extflag[2] =
+ { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+ if (!iip)
+ return;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ /*
+ * This can happen if we gave up in iformat in an error path,
+ * for the attribute fork.
+ */
+ if (!ifp) {
+ ASSERT(whichfork == XFS_ATTR_FORK);
+ return;
+ }
+ cp = XFS_DFORK_PTR(dip, whichfork);
+ mp = ip->i_mount;
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_LOCAL:
+ if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+ memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+ }
+ break;
+
+ case XFS_DINODE_FMT_EXTENTS:
+ ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+ !(iip->ili_format.ilf_fields & extflag[whichfork]));
+ if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+ (ifp->if_bytes > 0)) {
+ ASSERT(xfs_iext_get_ext(ifp, 0));
+ ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+ (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+ whichfork);
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+ (ifp->if_broot_bytes > 0)) {
+ ASSERT(ifp->if_broot != NULL);
+ ASSERT(ifp->if_broot_bytes <=
+ (XFS_IFORK_SIZE(ip, whichfork) +
+ XFS_BROOT_SIZE_ADJ));
+ xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+ (xfs_bmdr_block_t *)cp,
+ XFS_DFORK_SIZE(dip, mp, whichfork));
+ }
+ break;
+
+ case XFS_DINODE_FMT_DEV:
+ if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+ }
+ break;
+
+ case XFS_DINODE_FMT_UUID:
+ if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+ ASSERT(whichfork == XFS_DATA_FORK);
+ memcpy(XFS_DFORK_DPTR(dip),
+ &ip->i_df.if_u2.if_uuid,
+ sizeof(uuid_t));
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+STATIC int
+xfs_iflush_cluster(
+ xfs_inode_t *ip,
+ xfs_buf_t *bp)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ unsigned long first_index, mask;
+ unsigned long inodes_per_cluster;
+ int ilist_size;
+ xfs_inode_t **ilist;
+ xfs_inode_t *iq;
+ int nr_found;
+ int clcount = 0;
+ int bufwasdelwri;
+ int i;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+ inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
+ ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
+ if (!ilist)
+ goto out_put;
+
+ mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
+ rcu_read_lock();
+ /* really need a gang lookup range call here */
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
+ first_index, inodes_per_cluster);
+ if (nr_found == 0)
+ goto out_free;
+
+ for (i = 0; i < nr_found; i++) {
+ iq = ilist[i];
+ if (iq == ip)
+ continue;
+
+ /*
+ * because this is an RCU protected lookup, we could find a
+ * recently freed or even reallocated inode during the lookup.
+ * We need to check under the i_flags_lock for a valid inode
+ * here. Skip it if it is not valid or the wrong inode.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (!ip->i_ino ||
+ (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Do an un-protected check to see if the inode is dirty and
+ * is a candidate for flushing. These checks will be repeated
+ * later after the appropriate locks are acquired.
+ */
+ if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
+ continue;
+
+ /*
+ * Try to get locks. If any are unavailable or it is pinned,
+ * then this inode cannot be flushed and is skipped.
+ */
+
+ if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
+ continue;
+ if (!xfs_iflock_nowait(iq)) {
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ continue;
+ }
+ if (xfs_ipincount(iq)) {
+ xfs_ifunlock(iq);
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ /*
+ * arriving here means that this inode can be flushed. First
+ * re-check that it's dirty before flushing.
+ */
+ if (!xfs_inode_clean(iq)) {
+ int error;
+ error = xfs_iflush_int(iq, bp);
+ if (error) {
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ goto cluster_corrupt_out;
+ }
+ clcount++;
+ } else {
+ xfs_ifunlock(iq);
+ }
+ xfs_iunlock(iq, XFS_ILOCK_SHARED);
+ }
+
+ if (clcount) {
+ XFS_STATS_INC(xs_icluster_flushcnt);
+ XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+ }
+
+out_free:
+ rcu_read_unlock();
+ kmem_free(ilist);
+out_put:
+ xfs_perag_put(pag);
+ return 0;
+
+
+cluster_corrupt_out:
+ /*
+ * Corruption detected in the clustering loop. Invalidate the
+ * inode buffer and shut down the filesystem.
+ */
+ rcu_read_unlock();
+ /*
+ * Clean up the buffer. If it was B_DELWRI, just release it --
+ * brelse can handle it with no problems. If not, shut down the
+ * filesystem before releasing the buffer.
+ */
+ bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+ if (bufwasdelwri)
+ xfs_buf_relse(bp);
+
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
+ if (!bufwasdelwri) {
+ /*
+ * Just like incore_relse: if we have b_iodone functions,
+ * mark the buffer as an error and call them. Otherwise
+ * mark it as stale and brelse.
+ */
+ if (XFS_BUF_IODONE_FUNC(bp)) {
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_STALE(bp);
+ XFS_BUF_ERROR(bp,EIO);
+ xfs_buf_ioend(bp, 0);
+ } else {
+ XFS_BUF_STALE(bp);
+ xfs_buf_relse(bp);
+ }
+ }
+
+ /*
+ * Unlocks the flush lock
+ */
+ xfs_iflush_abort(iq);
+ kmem_free(ilist);
+ xfs_perag_put(pag);
+ return XFS_ERROR(EFSCORRUPTED);
+}
+
+/*
+ * xfs_iflush() will write a modified inode's changes out to the
+ * inode's on disk home. The caller must have the inode lock held
+ * in at least shared mode and the inode flush completion must be
+ * active as well. The inode lock will still be held upon return from
+ * the call and the caller is free to unlock it.
+ * The inode flush will be completed when the inode reaches the disk.
+ * The flags indicate how the inode's buffer should be written out.
+ */
+int
+xfs_iflush(
+ xfs_inode_t *ip,
+ uint flags)
+{
+ xfs_inode_log_item_t *iip;
+ xfs_buf_t *bp;
+ xfs_dinode_t *dip;
+ xfs_mount_t *mp;
+ int error;
+
+ XFS_STATS_INC(xs_iflush_count);
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_d.di_nextents > ip->i_df.if_ext_max);
+
+ iip = ip->i_itemp;
+ mp = ip->i_mount;
+
+ /*
+ * We can't flush the inode until it is unpinned, so wait for it if we
+ * are allowed to block. We know no one new can pin it, because we are
+ * holding the inode lock shared and you need to hold it exclusively to
+ * pin the inode.
+ *
+ * If we are not allowed to block, force the log out asynchronously so
+ * that when we come back the inode will be unpinned. If other inodes
+ * in the same cluster are dirty, they will probably write the inode
+ * out for us if they occur after the log force completes.
+ */
+ if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
+ xfs_iunpin_nowait(ip);
+ xfs_ifunlock(ip);
+ return EAGAIN;
+ }
+ xfs_iunpin_wait(ip);
+
+ /*
+ * For stale inodes we cannot rely on the backing buffer remaining
+ * stale in cache for the remaining life of the stale inode and so
+ * xfs_itobp() below may give us a buffer that no longer contains
+ * inodes below. We have to check this after ensuring the inode is
+ * unpinned so that it is safe to reclaim the stale inode after the
+ * flush call.
+ */
+ if (xfs_iflags_test(ip, XFS_ISTALE)) {
+ xfs_ifunlock(ip);
+ return 0;
+ }
+
+ /*
+ * This may have been unpinned because the filesystem is shutting
+ * down forcibly. If that's the case we must not write this inode
+ * to disk, because the log record didn't make it to disk!
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ ip->i_update_core = 0;
+ if (iip)
+ iip->ili_format.ilf_fields = 0;
+ xfs_ifunlock(ip);
+ return XFS_ERROR(EIO);
+ }
+
+ /*
+ * Get the buffer containing the on-disk inode.
+ */
+ error = xfs_itobp(mp, NULL, ip, &dip, &bp,
+ (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
+ if (error || !bp) {
+ xfs_ifunlock(ip);
+ return error;
+ }
+
+ /*
+ * First flush out the inode that xfs_iflush was called with.
+ */
+ error = xfs_iflush_int(ip, bp);
+ if (error)
+ goto corrupt_out;
+
+ /*
+ * If the buffer is pinned then push on the log now so we won't
+ * get stuck waiting in the write for too long.
+ */
+ if (XFS_BUF_ISPINNED(bp))
+ xfs_log_force(mp, 0);
+
+ /*
+ * inode clustering:
+ * see if other inodes can be gathered into this write
+ */
+ error = xfs_iflush_cluster(ip, bp);
+ if (error)
+ goto cluster_corrupt_out;
+
+ if (flags & SYNC_WAIT)
+ error = xfs_bwrite(mp, bp);
+ else
+ xfs_bdwrite(mp, bp);
+ return error;
+
+corrupt_out:
+ xfs_buf_relse(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+cluster_corrupt_out:
+ /*
+ * Unlocks the flush lock
+ */
+ xfs_iflush_abort(ip);
+ return XFS_ERROR(EFSCORRUPTED);
+}
+
+
+STATIC int
+xfs_iflush_int(
+ xfs_inode_t *ip,
+ xfs_buf_t *bp)
+{
+ xfs_inode_log_item_t *iip;
+ xfs_dinode_t *dip;
+ xfs_mount_t *mp;
+#ifdef XFS_TRANS_DEBUG
+ int first;
+#endif
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+ ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_d.di_nextents > ip->i_df.if_ext_max);
+
+ iip = ip->i_itemp;
+ mp = ip->i_mount;
+
+ /* set *dip = inode's place in the buffer */
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+
+ /*
+ * Clear i_update_core before copying out the data.
+ * This is for coordination with our timestamp updates
+ * that don't hold the inode lock. They will always
+ * update the timestamps BEFORE setting i_update_core,
+ * so if we clear i_update_core after they set it we
+ * are guaranteed to see their updates to the timestamps.
+ * I believe that this depends on strongly ordered memory
+ * semantics, but we have that. We use the SYNCHRONIZE
+ * macro to make sure that the compiler does not reorder
+ * the i_update_core access below the data copy below.
+ */
+ ip->i_update_core = 0;
+ SYNCHRONIZE();
+
+ /*
+ * Make sure to get the latest timestamps from the Linux inode.
+ */
+ xfs_synchronize_times(ip);
+
+ if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
+ mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
+ __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
+ goto corrupt_out;
+ }
+ if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
+ mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
+ __func__, ip->i_ino, ip, ip->i_d.di_magic);
+ goto corrupt_out;
+ }
+ if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+ if (XFS_TEST_ERROR(
+ (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad regular inode %Lu, ptr 0x%p",
+ __func__, ip->i_ino, ip);
+ goto corrupt_out;
+ }
+ } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+ if (XFS_TEST_ERROR(
+ (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
+ mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: Bad directory inode %Lu, ptr 0x%p",
+ __func__, ip->i_ino, ip);
+ goto corrupt_out;
+ }
+ }
+ if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
+ ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
+ XFS_RANDOM_IFLUSH_5)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: detected corrupt incore inode %Lu, "
+ "total extents = %d, nblocks = %Ld, ptr 0x%p",
+ __func__, ip->i_ino,
+ ip->i_d.di_nextents + ip->i_d.di_anextents,
+ ip->i_d.di_nblocks, ip);
+ goto corrupt_out;
+ }
+ if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
+ mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
+ xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+ "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
+ __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
+ goto corrupt_out;
+ }
+ /*
+ * bump the flush iteration count, used to detect flushes which
+ * postdate a log record during recovery.
+ */
+
+ ip->i_d.di_flushiter++;
+
+ /*
+ * Copy the dirty parts of the inode into the on-disk
+ * inode. We always copy out the core of the inode,
+ * because if the inode is dirty at all the core must
+ * be.
+ */
+ xfs_dinode_to_disk(dip, &ip->i_d);
+
+ /* Wrap, we never let the log put out DI_MAX_FLUSH */
+ if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
+ ip->i_d.di_flushiter = 0;
+
+ /*
+ * If this is really an old format inode and the superblock version
+ * has not been updated to support only new format inodes, then
+ * convert back to the old inode format. If the superblock version
+ * has been updated, then make the conversion permanent.
+ */
+ ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
+ if (ip->i_d.di_version == 1) {
+ if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+ /*
+ * Convert it back.
+ */
+ ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
+ dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
+ } else {
+ /*
+ * The superblock version has already been bumped,
+ * so just make the conversion to the new inode
+ * format permanent.
+ */
+ ip->i_d.di_version = 2;
+ dip->di_version = 2;
+ ip->i_d.di_onlink = 0;
+ dip->di_onlink = 0;
+ memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+ memset(&(dip->di_pad[0]), 0,
+ sizeof(dip->di_pad));
+ ASSERT(xfs_get_projid(ip) == 0);
+ }
+ }
+
+ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
+ if (XFS_IFORK_Q(ip))
+ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
+ xfs_inobp_check(mp, bp);
+
+ /*
+ * We've recorded everything logged in the inode, so we'd
+ * like to clear the ilf_fields bits so we don't log and
+ * flush things unnecessarily. However, we can't stop
+ * logging all this information until the data we've copied
+ * into the disk buffer is written to disk. If we did we might
+ * overwrite the copy of the inode in the log with all the
+ * data after re-logging only part of it, and in the face of
+ * a crash we wouldn't have all the data we need to recover.
+ *
+ * What we do is move the bits to the ili_last_fields field.
+ * When logging the inode, these bits are moved back to the
+ * ilf_fields field. In the xfs_iflush_done() routine we
+ * clear ili_last_fields, since we know that the information
+ * those bits represent is permanently on disk. As long as
+ * the flush completes before the inode is logged again, then
+ * both ilf_fields and ili_last_fields will be cleared.
+ *
+ * We can play with the ilf_fields bits here, because the inode
+ * lock must be held exclusively in order to set bits there
+ * and the flush lock protects the ili_last_fields bits.
+ * Set ili_logged so the flush done
+ * routine can tell whether or not to look in the AIL.
+ * Also, store the current LSN of the inode so that we can tell
+ * whether the item has moved in the AIL from xfs_iflush_done().
+ * In order to read the lsn we need the AIL lock, because
+ * it is a 64 bit value that cannot be read atomically.
+ */
+ if (iip != NULL && iip->ili_format.ilf_fields != 0) {
+ iip->ili_last_fields = iip->ili_format.ilf_fields;
+ iip->ili_format.ilf_fields = 0;
+ iip->ili_logged = 1;
+
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
+
+ /*
+ * Attach the function xfs_iflush_done to the inode's
+ * buffer. This will remove the inode from the AIL
+ * and unlock the inode's flush lock when the inode is
+ * completely written to disk.
+ */
+ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+ ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
+ } else {
+ /*
+ * We're flushing an inode which is not in the AIL and has
+ * not been logged but has i_update_core set. For this
+ * case we can use a B_DELWRI flush and immediately drop
+ * the inode flush lock because we can avoid the whole
+ * AIL state thing. It's OK to drop the flush lock now,
+ * because we've already locked the buffer and to do anything
+ * you really need both.
+ */
+ if (iip != NULL) {
+ ASSERT(iip->ili_logged == 0);
+ ASSERT(iip->ili_last_fields == 0);
+ ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
+ }
+ xfs_ifunlock(ip);
+ }
+
+ return 0;
+
+corrupt_out:
+ return XFS_ERROR(EFSCORRUPTED);
+}
+
+void
+xfs_promote_inode(
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *bp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+
+ bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
+ ip->i_imap.im_len, XBF_TRYLOCK);
+ if (!bp)
+ return;
+
+ if (XFS_BUF_ISDELAYWRITE(bp)) {
+ xfs_buf_delwri_promote(bp);
+ wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
+ }
+
+ xfs_buf_relse(bp);
+}
+
+/*
+ * Return a pointer to the extent record at file index idx.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx) /* index of target extent */
+{
+ ASSERT(idx >= 0);
+ ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+ if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
+ return ifp->if_u1.if_ext_irec->er_extbuf;
+ } else if (ifp->if_flags & XFS_IFEXTIREC) {
+ xfs_ext_irec_t *erp; /* irec pointer */
+ int erp_idx = 0; /* irec index */
+ xfs_extnum_t page_idx = idx; /* ext index in target list */
+
+ erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+ return &erp->er_extbuf[page_idx];
+ } else if (ifp->if_bytes) {
+ return &ifp->if_u1.if_extents[idx];
+ } else {
+ return NULL;
+ }
+}
+
+/*
+ * Insert new item(s) into the extent records for incore inode
+ * fork 'ifp'. 'count' new items are inserted at index 'idx'.
+ */
+void
+xfs_iext_insert(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t idx, /* starting index of new items */
+ xfs_extnum_t count, /* number of inserted items */
+ xfs_bmbt_irec_t *new, /* items to insert */
+ int state) /* type of extent conversion */
+{
+ xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_extnum_t i; /* extent record index */
+
+ trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ xfs_iext_add(ifp, idx, count);
+ for (i = idx; i < idx + count; i++, new++)
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ */
+void
+xfs_iext_add(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin adding exts */
+ int ext_diff) /* number of extents to add */
+{
+ int byte_diff; /* new bytes being added */
+ int new_size; /* size of extents after adding */
+ xfs_extnum_t nextents; /* number of extents in file */
+
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT((idx >= 0) && (idx <= nextents));
+ byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+ new_size = ifp->if_bytes + byte_diff;
+ /*
+ * If the new number of extents (nextents + ext_diff)
+ * fits inside the inode, then continue to use the inline
+ * extent buffer.
+ */
+ if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+ if (idx < nextents) {
+ memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+ &ifp->if_u2.if_inline_ext[idx],
+ (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+ memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+ }
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+ ifp->if_real_bytes = 0;
+ }
+ /*
+ * Otherwise use a linear (direct) extent list.
+ * If the extents are currently inside the inode,
+ * xfs_iext_realloc_direct will switch us from
+ * inline to direct extent allocation mode.
+ */
+ else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+ xfs_iext_realloc_direct(ifp, new_size);
+ if (idx < nextents) {
+ memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+ &ifp->if_u1.if_extents[idx],
+ (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+ memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+ }
+ }
+ /* Indirection array */
+ else {
+ xfs_ext_irec_t *erp;
+ int erp_idx = 0;
+ int page_idx = idx;
+
+ ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+ } else {
+ xfs_iext_irec_init(ifp);
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ erp = ifp->if_u1.if_ext_irec;
+ }
+ /* Extents fit in target extent page */
+ if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+ if (page_idx < erp->er_extcount) {
+ memmove(&erp->er_extbuf[page_idx + ext_diff],
+ &erp->er_extbuf[page_idx],
+ (erp->er_extcount - page_idx) *
+ sizeof(xfs_bmbt_rec_t));
+ memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+ }
+ erp->er_extcount += ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+ }
+ /* Insert a new extent page */
+ else if (erp) {
+ xfs_iext_add_indirect_multi(ifp,
+ erp_idx, page_idx, ext_diff);
+ }
+ /*
+ * If extent(s) are being appended to the last page in
+ * the indirection array and the new extent(s) don't fit
+ * in the page, then erp is NULL and erp_idx is set to
+ * the next index needed in the indirection array.
+ */
+ else {
+ int count = ext_diff;
+
+ while (count) {
+ erp = xfs_iext_irec_new(ifp, erp_idx);
+ erp->er_extcount = count;
+ count -= MIN(count, (int)XFS_LINEAR_EXTS);
+ if (count) {
+ erp_idx++;
+ }
+ }
+ }
+ }
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ * |-------| |-------|
+ * | | | | idx - number of extents before idx
+ * | idx | | count |
+ * | | | | count - number of extents being inserted at idx
+ * |-------| |-------|
+ * | count | | nex2 | nex2 - number of extents after idx + count
+ * |-------| |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx, /* target extent irec index */
+ xfs_extnum_t idx, /* index within target list */
+ int count) /* new extents being added */
+{
+ int byte_diff; /* new bytes being added */
+ xfs_ext_irec_t *erp; /* pointer to irec entry */
+ xfs_extnum_t ext_diff; /* number of extents to add */
+ xfs_extnum_t ext_cnt; /* new extents still needed */
+ xfs_extnum_t nex2; /* extents after idx + count */
+ xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */
+ int nlists; /* number of irec's (lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ nex2 = erp->er_extcount - idx;
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+ /*
+ * Save second part of target extent list
+ * (all extents past */
+ if (nex2) {
+ byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+ nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+ memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+ erp->er_extcount -= nex2;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+ memset(&erp->er_extbuf[idx], 0, byte_diff);
+ }
+
+ /*
+ * Add the new extents to the end of the target
+ * list, then allocate new irec record(s) and
+ * extent buffer(s) as needed to store the rest
+ * of the new extents.
+ */
+ ext_cnt = count;
+ ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+ if (ext_diff) {
+ erp->er_extcount += ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+ ext_cnt -= ext_diff;
+ }
+ while (ext_cnt) {
+ erp_idx++;
+ erp = xfs_iext_irec_new(ifp, erp_idx);
+ ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+ erp->er_extcount = ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+ ext_cnt -= ext_diff;
+ }
+
+ /* Add nex2 extents back to indirection array */
+ if (nex2) {
+ xfs_extnum_t ext_avail;
+ int i;
+
+ byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+ ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+ i = 0;
+ /*
+ * If nex2 extents fit in the current page, append
+ * nex2_ep after the new extents.
+ */
+ if (nex2 <= ext_avail) {
+ i = erp->er_extcount;
+ }
+ /*
+ * Otherwise, check if space is available in the
+ * next page.
+ */
+ else if ((erp_idx < nlists - 1) &&
+ (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+ ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+ erp_idx++;
+ erp++;
+ /* Create a hole for nex2 extents */
+ memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+ erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+ }
+ /*
+ * Final choice, create a new extent page for
+ * nex2 extents.
+ */
+ else {
+ erp_idx++;
+ erp = xfs_iext_irec_new(ifp, erp_idx);
+ }
+ memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+ kmem_free(nex2_ep);
+ erp->er_extcount += nex2;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+ }
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array. Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ */
+void
+xfs_iext_remove(
+ xfs_inode_t *ip, /* incore inode pointer */
+ xfs_extnum_t idx, /* index to begin removing exts */
+ int ext_diff, /* number of extents to remove */
+ int state) /* type of extent conversion */
+{
+ xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+ xfs_extnum_t nextents; /* number of extents in file */
+ int new_size; /* size of extents after removal */
+
+ trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+ ASSERT(ext_diff > 0);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ } else if (ifp->if_flags & XFS_IFEXTIREC) {
+ xfs_iext_remove_indirect(ifp, idx, ext_diff);
+ } else if (ifp->if_real_bytes) {
+ xfs_iext_remove_direct(ifp, idx, ext_diff);
+ } else {
+ xfs_iext_remove_inline(ifp, idx, ext_diff);
+ }
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ */
+void
+xfs_iext_remove_inline(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin removing exts */
+ int ext_diff) /* number of extents to remove */
+{
+ int nextents; /* number of extents in file */
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+ ASSERT(idx < XFS_INLINE_EXTS);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(((nextents - ext_diff) > 0) &&
+ (nextents - ext_diff) < XFS_INLINE_EXTS);
+
+ if (idx + ext_diff < nextents) {
+ memmove(&ifp->if_u2.if_inline_ext[idx],
+ &ifp->if_u2.if_inline_ext[idx + ext_diff],
+ (nextents - (idx + ext_diff)) *
+ sizeof(xfs_bmbt_rec_t));
+ memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+ 0, ext_diff * sizeof(xfs_bmbt_rec_t));
+ } else {
+ memset(&ifp->if_u2.if_inline_ext[idx], 0,
+ ext_diff * sizeof(xfs_bmbt_rec_t));
+ }
+}
+
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx. If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ */
+void
+xfs_iext_remove_direct(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin removing exts */
+ int ext_diff) /* number of extents to remove */
+{
+ xfs_extnum_t nextents; /* number of extents in file */
+ int new_size; /* size of extents after removal */
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+ new_size = ifp->if_bytes -
+ (ext_diff * sizeof(xfs_bmbt_rec_t));
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ return;
+ }
+ /* Move extents up in the list (if needed) */
+ if (idx + ext_diff < nextents) {
+ memmove(&ifp->if_u1.if_extents[idx],
+ &ifp->if_u1.if_extents[idx + ext_diff],
+ (nextents - (idx + ext_diff)) *
+ sizeof(xfs_bmbt_rec_t));
+ }
+ memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+ 0, ext_diff * sizeof(xfs_bmbt_rec_t));
+ /*
+ * Reallocate the direct extent list. If the extents
+ * will fit inside the inode then xfs_iext_realloc_direct
+ * will switch from direct to inline extent allocation
+ * mode for us.
+ */
+ xfs_iext_realloc_direct(ifp, new_size);
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ * |-------| |-------|
+ * | nex1 | | | nex1 - number of extents before idx
+ * |-------| | count |
+ * | | | | count - number of extents being removed at idx
+ * | count | |-------|
+ * | | | nex2 | nex2 - number of extents after idx + count
+ * |-------| |-------|
+ */
+void
+xfs_iext_remove_indirect(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t idx, /* index to begin removing extents */
+ int count) /* number of extents to remove */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ int erp_idx = 0; /* indirection array index */
+ xfs_extnum_t ext_cnt; /* extents left to remove */
+ xfs_extnum_t ext_diff; /* extents to remove in current list */
+ xfs_extnum_t nex1; /* number of extents before idx */
+ xfs_extnum_t nex2; /* extents after idx + count */
+ int page_idx = idx; /* index in target extent list */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+ ASSERT(erp != NULL);
+ nex1 = page_idx;
+ ext_cnt = count;
+ while (ext_cnt) {
+ nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+ ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+ /*
+ * Check for deletion of entire list;
+ * xfs_iext_irec_remove() updates extent offsets.
+ */
+ if (ext_diff == erp->er_extcount) {
+ xfs_iext_irec_remove(ifp, erp_idx);
+ ext_cnt -= ext_diff;
+ nex1 = 0;
+ if (ext_cnt) {
+ ASSERT(erp_idx < ifp->if_real_bytes /
+ XFS_IEXT_BUFSZ);
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ nex1 = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+ /* Move extents up (if needed) */
+ if (nex2) {
+ memmove(&erp->er_extbuf[nex1],
+ &erp->er_extbuf[nex1 + ext_diff],
+ nex2 * sizeof(xfs_bmbt_rec_t));
+ }
+ /* Zero out rest of page */
+ memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+ ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+ /* Update remaining counters */
+ erp->er_extcount -= ext_diff;
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+ ext_cnt -= ext_diff;
+ nex1 = 0;
+ erp_idx++;
+ erp++;
+ }
+ ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+ xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ */
+void
+xfs_iext_realloc_direct(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int new_size) /* new size of extents */
+{
+ int rnew_size; /* real new size of extents */
+
+ rnew_size = new_size;
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+ ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+ (new_size != ifp->if_real_bytes)));
+
+ /* Free extent records */
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ }
+ /* Resize direct extent list and zero any new bytes */
+ else if (ifp->if_real_bytes) {
+ /* Check if extents will fit inside the inode */
+ if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+ xfs_iext_direct_to_inline(ifp, new_size /
+ (uint)sizeof(xfs_bmbt_rec_t));
+ ifp->if_bytes = new_size;
+ return;
+ }
+ if (!is_power_of_2(new_size)){
+ rnew_size = roundup_pow_of_two(new_size);
+ }
+ if (rnew_size != ifp->if_real_bytes) {
+ ifp->if_u1.if_extents =
+ kmem_realloc(ifp->if_u1.if_extents,
+ rnew_size,
+ ifp->if_real_bytes, KM_NOFS);
+ }
+ if (rnew_size > ifp->if_real_bytes) {
+ memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)], 0,
+ rnew_size - ifp->if_real_bytes);
+ }
+ }
+ /*
+ * Switch from the inline extent buffer to a direct
+ * extent list. Be sure to include the inline extent
+ * bytes in new_size.
+ */
+ else {
+ new_size += ifp->if_bytes;
+ if (!is_power_of_2(new_size)) {
+ rnew_size = roundup_pow_of_two(new_size);
+ }
+ xfs_iext_inline_to_direct(ifp, rnew_size);
+ }
+ ifp->if_real_bytes = rnew_size;
+ ifp->if_bytes = new_size;
+}
+
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ */
+void
+xfs_iext_direct_to_inline(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t nextents) /* number of extents in file */
+{
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ ASSERT(nextents <= XFS_INLINE_EXTS);
+ /*
+ * The inline buffer was zeroed when we switched
+ * from inline to direct extent allocation mode,
+ * so we don't need to clear it here.
+ */
+ memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+ nextents * sizeof(xfs_bmbt_rec_t));
+ kmem_free(ifp->if_u1.if_extents);
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+ ifp->if_real_bytes = 0;
+}
+
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ */
+void
+xfs_iext_inline_to_direct(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int new_size) /* number of extents in file */
+{
+ ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+ memset(ifp->if_u1.if_extents, 0, new_size);
+ if (ifp->if_bytes) {
+ memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+ ifp->if_bytes);
+ memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+ sizeof(xfs_bmbt_rec_t));
+ }
+ ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Resize an extent indirection array to new_size bytes.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int new_size) /* new indirection array size */
+{
+ int nlists; /* number of irec's (ex lists) */
+ int size; /* current indirection array size */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ size = nlists * sizeof(xfs_ext_irec_t);
+ ASSERT(ifp->if_real_bytes);
+ ASSERT((new_size >= 0) && (new_size != size));
+ if (new_size == 0) {
+ xfs_iext_destroy(ifp);
+ } else {
+ ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+ kmem_realloc(ifp->if_u1.if_ext_irec,
+ new_size, size, KM_NOFS);
+ }
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ xfs_extnum_t nextents; /* number of extents in file */
+ int size; /* size of file extents */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(nextents <= XFS_LINEAR_EXTS);
+ size = nextents * sizeof(xfs_bmbt_rec_t);
+
+ xfs_iext_irec_compact_pages(ifp);
+ ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+ ep = ifp->if_u1.if_ext_irec->er_extbuf;
+ kmem_free(ifp->if_u1.if_ext_irec);
+ ifp->if_flags &= ~XFS_IFEXTIREC;
+ ifp->if_u1.if_extents = ep;
+ ifp->if_bytes = size;
+ if (nextents < XFS_LINEAR_EXTS) {
+ xfs_iext_realloc_direct(ifp, size);
+ }
+}
+
+/*
+ * Free incore file extents.
+ */
+void
+xfs_iext_destroy(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ int erp_idx;
+ int nlists;
+
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+ xfs_iext_irec_remove(ifp, erp_idx);
+ }
+ ifp->if_flags &= ~XFS_IFEXTIREC;
+ } else if (ifp->if_real_bytes) {
+ kmem_free(ifp->if_u1.if_extents);
+ } else if (ifp->if_bytes) {
+ memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+ sizeof(xfs_bmbt_rec_t));
+ }
+ ifp->if_u1.if_extents = NULL;
+ ifp->if_real_bytes = 0;
+ ifp->if_bytes = 0;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ */
+xfs_bmbt_rec_host_t * /* pointer to found extent record */
+xfs_iext_bno_to_ext(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number to search for */
+ xfs_extnum_t *idxp) /* index of target extent */
+{
+ xfs_bmbt_rec_host_t *base; /* pointer to first extent */
+ xfs_filblks_t blockcount = 0; /* number of blocks in extent */
+ xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
+ xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
+ int high; /* upper boundary in search */
+ xfs_extnum_t idx = 0; /* index of target extent */
+ int low; /* lower boundary in search */
+ xfs_extnum_t nextents; /* number of file extents */
+ xfs_fileoff_t startoff = 0; /* start offset of extent */
+
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ if (nextents == 0) {
+ *idxp = 0;
+ return NULL;
+ }
+ low = 0;
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ /* Find target extent list */
+ int erp_idx = 0;
+ erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+ base = erp->er_extbuf;
+ high = erp->er_extcount - 1;
+ } else {
+ base = ifp->if_u1.if_extents;
+ high = nextents - 1;
+ }
+ /* Binary search extent records */
+ while (low <= high) {
+ idx = (low + high) >> 1;
+ ep = base + idx;
+ startoff = xfs_bmbt_get_startoff(ep);
+ blockcount = xfs_bmbt_get_blockcount(ep);
+ if (bno < startoff) {
+ high = idx - 1;
+ } else if (bno >= startoff + blockcount) {
+ low = idx + 1;
+ } else {
+ /* Convert back to file-based extent index */
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ idx += erp->er_extoff;
+ }
+ *idxp = idx;
+ return ep;
+ }
+ }
+ /* Convert back to file-based extent index */
+ if (ifp->if_flags & XFS_IFEXTIREC) {
+ idx += erp->er_extoff;
+ }
+ if (bno >= startoff + blockcount) {
+ if (++idx == nextents) {
+ ep = NULL;
+ } else {
+ ep = xfs_iext_get_ext(ifp, idx);
+ }
+ }
+ *idxp = idx;
+ return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ */
+xfs_ext_irec_t * /* pointer to found extent record */
+xfs_iext_bno_to_irec(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_fileoff_t bno, /* block number to search for */
+ int *erp_idxp) /* irec index of target ext list */
+{
+ xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
+ xfs_ext_irec_t *erp_next; /* next indirection array entry */
+ int erp_idx; /* indirection array index */
+ int nlists; /* number of extent irec's (lists) */
+ int high; /* binary search upper limit */
+ int low; /* binary search lower limit */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ erp_idx = 0;
+ low = 0;
+ high = nlists - 1;
+ while (low <= high) {
+ erp_idx = (low + high) >> 1;
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+ if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+ high = erp_idx - 1;
+ } else if (erp_next && bno >=
+ xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+ low = erp_idx + 1;
+ } else {
+ break;
+ }
+ }
+ *erp_idxp = erp_idx;
+ return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ xfs_extnum_t *idxp, /* extent index (file -> page) */
+ int *erp_idxp, /* pointer to target irec */
+ int realloc) /* new bytes were just added */
+{
+ xfs_ext_irec_t *prev; /* pointer to previous irec */
+ xfs_ext_irec_t *erp = NULL; /* pointer to current irec */
+ int erp_idx; /* indirection array index */
+ int nlists; /* number of irec's (ex lists) */
+ int high; /* binary search upper limit */
+ int low; /* binary search lower limit */
+ xfs_extnum_t page_idx = *idxp; /* extent index in target list */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ ASSERT(page_idx >= 0);
+ ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+ ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ erp_idx = 0;
+ low = 0;
+ high = nlists - 1;
+
+ /* Binary search extent irec's */
+ while (low <= high) {
+ erp_idx = (low + high) >> 1;
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ prev = erp_idx > 0 ? erp - 1 : NULL;
+ if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+ realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+ high = erp_idx - 1;
+ } else if (page_idx > erp->er_extoff + erp->er_extcount ||
+ (page_idx == erp->er_extoff + erp->er_extcount &&
+ !realloc)) {
+ low = erp_idx + 1;
+ } else if (page_idx == erp->er_extoff + erp->er_extcount &&
+ erp->er_extcount == XFS_LINEAR_EXTS) {
+ ASSERT(realloc);
+ page_idx = 0;
+ erp_idx++;
+ erp = erp_idx < nlists ? erp + 1 : NULL;
+ break;
+ } else {
+ page_idx -= erp->er_extoff;
+ break;
+ }
+ }
+ *idxp = page_idx;
+ *erp_idxp = erp_idx;
+ return(erp);
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ */
+void
+xfs_iext_irec_init(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ xfs_extnum_t nextents; /* number of extents in file */
+
+ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+ erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+ if (nextents == 0) {
+ ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+ } else if (!ifp->if_real_bytes) {
+ xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+ } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+ xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+ }
+ erp->er_extbuf = ifp->if_u1.if_extents;
+ erp->er_extcount = nextents;
+ erp->er_extoff = 0;
+
+ ifp->if_flags |= XFS_IFEXTIREC;
+ ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+ ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+ ifp->if_u1.if_ext_irec = erp;
+
+ return;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx) /* index for new irec */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ int i; /* loop counter */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+ /* Resize indirection array */
+ xfs_iext_realloc_indirect(ifp, ++nlists *
+ sizeof(xfs_ext_irec_t));
+ /*
+ * Move records down in the array so the
+ * new page can use erp_idx.
+ */
+ erp = ifp->if_u1.if_ext_irec;
+ for (i = nlists - 1; i > erp_idx; i--) {
+ memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+ }
+ ASSERT(i == erp_idx);
+
+ /* Initialize new extent record */
+ erp = ifp->if_u1.if_ext_irec;
+ erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+ ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+ memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+ erp[erp_idx].er_extcount = 0;
+ erp[erp_idx].er_extoff = erp_idx > 0 ?
+ erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+ return (&erp[erp_idx]);
+}
+
+/*
+ * Remove a record from the indirection array.
+ */
+void
+xfs_iext_irec_remove(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx) /* irec index to remove */
+{
+ xfs_ext_irec_t *erp; /* indirection array pointer */
+ int i; /* loop counter */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ if (erp->er_extbuf) {
+ xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+ -erp->er_extcount);
+ kmem_free(erp->er_extbuf);
+ }
+ /* Compact extent records */
+ erp = ifp->if_u1.if_ext_irec;
+ for (i = erp_idx; i < nlists - 1; i++) {
+ memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+ }
+ /*
+ * Manually free the last extent record from the indirection
+ * array. A call to xfs_iext_realloc_indirect() with a size
+ * of zero would result in a call to xfs_iext_destroy() which
+ * would in turn call this function again, creating a nasty
+ * infinite loop.
+ */
+ if (--nlists) {
+ xfs_iext_realloc_indirect(ifp,
+ nlists * sizeof(xfs_ext_irec_t));
+ } else {
+ kmem_free(ifp->if_u1.if_ext_irec);
+ }
+ ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array. Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible. The
+ * compaction policy is as follows:
+ *
+ * Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ * No Compaction: Extents occupy at least 50% of allocated space
+ */
+void
+xfs_iext_irec_compact(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_extnum_t nextents; /* number of extents in file */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+ if (nextents == 0) {
+ xfs_iext_destroy(ifp);
+ } else if (nextents <= XFS_INLINE_EXTS) {
+ xfs_iext_indirect_to_direct(ifp);
+ xfs_iext_direct_to_inline(ifp, nextents);
+ } else if (nextents <= XFS_LINEAR_EXTS) {
+ xfs_iext_indirect_to_direct(ifp);
+ } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
+ xfs_iext_irec_compact_pages(ifp);
+ }
+}
+
+/*
+ * Combine extents from neighboring extent pages.
+ */
+void
+xfs_iext_irec_compact_pages(
+ xfs_ifork_t *ifp) /* inode fork pointer */
+{
+ xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */
+ int erp_idx = 0; /* indirection array index */
+ int nlists; /* number of irec's (ex lists) */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ while (erp_idx < nlists - 1) {
+ erp = &ifp->if_u1.if_ext_irec[erp_idx];
+ erp_next = erp + 1;
+ if (erp_next->er_extcount <=
+ (XFS_LINEAR_EXTS - erp->er_extcount)) {
+ memcpy(&erp->er_extbuf[erp->er_extcount],
+ erp_next->er_extbuf, erp_next->er_extcount *
+ sizeof(xfs_bmbt_rec_t));
+ erp->er_extcount += erp_next->er_extcount;
+ /*
+ * Free page before removing extent record
+ * so er_extoffs don't get modified in
+ * xfs_iext_irec_remove.
+ */
+ kmem_free(erp_next->er_extbuf);
+ erp_next->er_extbuf = NULL;
+ xfs_iext_irec_remove(ifp, erp_idx + 1);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ } else {
+ erp_idx++;
+ }
+ }
+}
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ */
+void
+xfs_iext_irec_update_extoffs(
+ xfs_ifork_t *ifp, /* inode fork pointer */
+ int erp_idx, /* irec index to update */
+ int ext_diff) /* number of new extents */
+{
+ int i; /* loop counter */
+ int nlists; /* number of irec's (ex lists */
+
+ ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+ nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+ for (i = erp_idx; i < nlists; i++) {
+ ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+ }
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
new file mode 100644
index 00000000..28b35964
--- /dev/null
+++ b/fs/xfs/xfs_inode.h
@@ -0,0 +1,600 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_INODE_H__
+#define __XFS_INODE_H__
+
+struct posix_acl;
+struct xfs_dinode;
+struct xfs_inode;
+
+/*
+ * Fork identifiers.
+ */
+#define XFS_DATA_FORK 0
+#define XFS_ATTR_FORK 1
+
+/*
+ * The following xfs_ext_irec_t struct introduces a second (top) level
+ * to the in-core extent allocation scheme. These structs are allocated
+ * in a contiguous block, creating an indirection array where each entry
+ * (irec) contains a pointer to a buffer of in-core extent records which
+ * it manages. Each extent buffer is 4k in size, since 4k is the system
+ * page size on Linux i386 and systems with larger page sizes don't seem
+ * to gain much, if anything, by using their native page size as the
+ * extent buffer size. Also, using 4k extent buffers everywhere provides
+ * a consistent interface for CXFS across different platforms.
+ *
+ * There is currently no limit on the number of irec's (extent lists)
+ * allowed, so heavily fragmented files may require an indirection array
+ * which spans multiple system pages of memory. The number of extents
+ * which would require this amount of contiguous memory is very large
+ * and should not cause problems in the foreseeable future. However,
+ * if the memory needed for the contiguous array ever becomes a problem,
+ * it is possible that a third level of indirection may be required.
+ */
+typedef struct xfs_ext_irec {
+ xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
+ xfs_extnum_t er_extoff; /* extent offset in file */
+ xfs_extnum_t er_extcount; /* number of extents in page/block */
+} xfs_ext_irec_t;
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+#define XFS_IEXT_BUFSZ 4096
+#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
+#define XFS_INLINE_EXTS 2
+#define XFS_INLINE_DATA 32
+typedef struct xfs_ifork {
+ int if_bytes; /* bytes in if_u1 */
+ int if_real_bytes; /* bytes allocated in if_u1 */
+ struct xfs_btree_block *if_broot; /* file's incore btree root */
+ short if_broot_bytes; /* bytes allocated for root */
+ unsigned char if_flags; /* per-fork flags */
+ unsigned char if_ext_max; /* max # of extent records */
+ union {
+ xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
+ xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
+ char *if_data; /* inline file data */
+ } if_u1;
+ union {
+ xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
+ /* very small file extents */
+ char if_inline_data[XFS_INLINE_DATA];
+ /* very small file data */
+ xfs_dev_t if_rdev; /* dev number if special */
+ uuid_t if_uuid; /* mount point value */
+ } if_u2;
+} xfs_ifork_t;
+
+/*
+ * Inode location information. Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+ xfs_daddr_t im_blkno; /* starting BB of inode chunk */
+ ushort im_len; /* length in BBs of inode chunk */
+ ushort im_boffset; /* inode offset in block in bytes */
+};
+
+/*
+ * This is the xfs in-core inode structure.
+ * Most of the on-disk inode is embedded in the i_d field.
+ *
+ * The extent pointers/inline file space, however, are managed
+ * separately. The memory for this information is pointed to by
+ * the if_u1 unions depending on the type of the data.
+ * This is used to linearize the array of extents for fast in-core
+ * access. This is used until the file's number of extents
+ * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
+ * are accessed through the buffer cache.
+ *
+ * Other state kept in the in-core inode is used for identification,
+ * locking, transactional updating, etc of the inode.
+ *
+ * Generally, we do not want to hold the i_rlock while holding the
+ * i_ilock. Hierarchy is i_iolock followed by i_rlock.
+ *
+ * xfs_iptr_t contains all the inode fields up to and including the
+ * i_mnext and i_mprev fields, it is used as a marker in the inode
+ * chain off the mount structure by xfs_sync calls.
+ */
+
+typedef struct xfs_ictimestamp {
+ __int32_t t_sec; /* timestamp seconds */
+ __int32_t t_nsec; /* timestamp nanoseconds */
+} xfs_ictimestamp_t;
+
+/*
+ * NOTE: This structure must be kept identical to struct xfs_dinode
+ * in xfs_dinode.h except for the endianness annotations.
+ */
+typedef struct xfs_icdinode {
+ __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
+ __uint16_t di_mode; /* mode and type of file */
+ __int8_t di_version; /* inode version */
+ __int8_t di_format; /* format of di_c data */
+ __uint16_t di_onlink; /* old number of links to file */
+ __uint32_t di_uid; /* owner's user id */
+ __uint32_t di_gid; /* owner's group id */
+ __uint32_t di_nlink; /* number of links to file */
+ __uint16_t di_projid_lo; /* lower part of owner's project id */
+ __uint16_t di_projid_hi; /* higher part of owner's project id */
+ __uint8_t di_pad[6]; /* unused, zeroed space */
+ __uint16_t di_flushiter; /* incremented on flush */
+ xfs_ictimestamp_t di_atime; /* time last accessed */
+ xfs_ictimestamp_t di_mtime; /* time last modified */
+ xfs_ictimestamp_t di_ctime; /* time created/inode modified */
+ xfs_fsize_t di_size; /* number of bytes in file */
+ xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
+ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
+ xfs_extnum_t di_nextents; /* number of extents in data fork */
+ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __int8_t di_aformat; /* format of attr fork's data */
+ __uint32_t di_dmevmask; /* DMIG event mask */
+ __uint16_t di_dmstate; /* DMIG state info */
+ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+ __uint32_t di_gen; /* generation number */
+} xfs_icdinode_t;
+
+/*
+ * Flags for xfs_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define XFS_IFINLINE 0x01 /* Inline data is read in */
+#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
+#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
+#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ &(ip)->i_df : \
+ (ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+ (XFS_IFORK_Q(ip) ? \
+ XFS_IFORK_BOFF(ip) : \
+ XFS_LITINO((ip)->i_mount))
+#define XFS_IFORK_ASIZE(ip) \
+ (XFS_IFORK_Q(ip) ? \
+ XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
+ 0)
+#define XFS_IFORK_SIZE(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_IFORK_DSIZE(ip) : \
+ XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (ip)->i_d.di_format : \
+ (ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+ ((w) == XFS_DATA_FORK ? \
+ ((ip)->i_d.di_format = (n)) : \
+ ((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (ip)->i_d.di_nextents : \
+ (ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+ ((w) == XFS_DATA_FORK ? \
+ ((ip)->i_d.di_nextents = (n)) : \
+ ((ip)->i_d.di_anextents = (n)))
+
+
+
+#ifdef __KERNEL__
+
+struct bhv_desc;
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_bmbt_irec;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+
+typedef struct dm_attrs_s {
+ __uint32_t da_dmevmask; /* DMIG event mask */
+ __uint16_t da_dmstate; /* DMIG state info */
+ __uint16_t da_pad; /* DMIG extra padding */
+} dm_attrs_t;
+
+typedef struct xfs_inode {
+ /* Inode linking and identification information. */
+ struct xfs_mount *i_mount; /* fs mount struct ptr */
+ struct xfs_dquot *i_udquot; /* user dquot */
+ struct xfs_dquot *i_gdquot; /* group dquot */
+
+ /* Inode location stuff */
+ xfs_ino_t i_ino; /* inode number (agno/agino)*/
+ struct xfs_imap i_imap; /* location for xfs_imap() */
+
+ /* Extent information. */
+ xfs_ifork_t *i_afp; /* attribute fork pointer */
+ xfs_ifork_t i_df; /* data fork */
+
+ /* Transaction and locking information. */
+ struct xfs_trans *i_transp; /* ptr to owning transaction*/
+ struct xfs_inode_log_item *i_itemp; /* logging information */
+ mrlock_t i_lock; /* inode lock */
+ mrlock_t i_iolock; /* inode IO lock */
+ struct completion i_flush; /* inode flush completion q */
+ atomic_t i_pincount; /* inode pin count */
+ wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
+ spinlock_t i_flags_lock; /* inode i_flags lock */
+ /* Miscellaneous state. */
+ unsigned short i_flags; /* see defined flags below */
+ unsigned char i_update_core; /* timestamps/size is dirty */
+ unsigned int i_delayed_blks; /* count of delay alloc blks */
+
+ xfs_icdinode_t i_d; /* most of ondisk inode */
+
+ xfs_fsize_t i_size; /* in-memory size */
+ xfs_fsize_t i_new_size; /* size when write completes */
+ atomic_t i_iocount; /* outstanding I/O count */
+
+ /* VFS inode */
+ struct inode i_vnode; /* embedded VFS inode */
+} xfs_inode_t;
+
+#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
+ (ip)->i_size : (ip)->i_d.di_size;
+
+/* Convert from vfs inode to xfs inode */
+static inline struct xfs_inode *XFS_I(struct inode *inode)
+{
+ return container_of(inode, struct xfs_inode, i_vnode);
+}
+
+/* convert from xfs inode to vfs inode */
+static inline struct inode *VFS_I(struct xfs_inode *ip)
+{
+ return &ip->i_vnode;
+}
+
+/*
+ * i_flags helper functions
+ */
+static inline void
+__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+{
+ ip->i_flags |= flags;
+}
+
+static inline void
+xfs_iflags_set(xfs_inode_t *ip, unsigned short flags)
+{
+ spin_lock(&ip->i_flags_lock);
+ __xfs_iflags_set(ip, flags);
+ spin_unlock(&ip->i_flags_lock);
+}
+
+static inline void
+xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags)
+{
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags &= ~flags;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+static inline int
+__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+{
+ return (ip->i_flags & flags);
+}
+
+static inline int
+xfs_iflags_test(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+ spin_lock(&ip->i_flags_lock);
+ ret = __xfs_iflags_test(ip, flags);
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
+static inline int
+xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+
+ spin_lock(&ip->i_flags_lock);
+ ret = ip->i_flags & flags;
+ if (ret)
+ ip->i_flags &= ~flags;
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was chosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline prid_t
+xfs_get_projid(struct xfs_inode *ip)
+{
+ return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
+}
+
+static inline void
+xfs_set_projid(struct xfs_inode *ip,
+ prid_t projid)
+{
+ ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16);
+ ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
+}
+
+/*
+ * Manage the i_flush queue embedded in the inode. This completion
+ * queue synchronizes processes attempting to flush the in-core
+ * inode back to disk.
+ */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+ wait_for_completion(&ip->i_flush);
+}
+
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+ return try_wait_for_completion(&ip->i_flush);
+}
+
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+ complete(&ip->i_flush);
+}
+
+/*
+ * In-core inode flags.
+ */
+#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
+#define XFS_ISTALE 0x0002 /* inode has been staled */
+#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
+#define XFS_INEW 0x0008 /* inode has just been allocated */
+#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
+#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
+
+/*
+ * Per-lifetime flags need to be reset when re-using a reclaimable inode during
+ * inode lookup. Thi prevents unintended behaviour on the new inode from
+ * ocurring.
+ */
+#define XFS_IRECLAIM_RESET_FLAGS \
+ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
+ XFS_IFILESTREAM);
+
+/*
+ * Flags for inode locking.
+ * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
+ * 1<<16 - 1<<32-1 -- lockdep annotation (integers)
+ */
+#define XFS_IOLOCK_EXCL (1<<0)
+#define XFS_IOLOCK_SHARED (1<<1)
+#define XFS_ILOCK_EXCL (1<<2)
+#define XFS_ILOCK_SHARED (1<<3)
+#define XFS_IUNLOCK_NONOTIFY (1<<4)
+
+#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
+ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
+
+#define XFS_LOCK_FLAGS \
+ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
+ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
+ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
+ { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
+ { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" }
+
+
+/*
+ * Flags for lockdep annotations.
+ *
+ * XFS_LOCK_PARENT - for directory operations that require locking a
+ * parent directory inode and a child entry inode. The parent gets locked
+ * with this flag so it gets a lockdep subclass of 1 and the child entry
+ * lock will have a lockdep subclass of 0.
+ *
+ * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
+ * inodes do not participate in the normal lock order, and thus have their
+ * own subclasses.
+ *
+ * XFS_LOCK_INUMORDER - for locking several inodes at the some time
+ * with xfs_lock_inodes(). This flag is used as the starting subclass
+ * and each subsequent lock acquired will increment the subclass by one.
+ * So the first lock acquired will have a lockdep subclass of 4, the
+ * second lock will have a lockdep subclass of 5, and so on. It is
+ * the responsibility of the class builder to shift this to the correct
+ * portion of the lock_mode lockdep mask.
+ */
+#define XFS_LOCK_PARENT 1
+#define XFS_LOCK_RTBITMAP 2
+#define XFS_LOCK_RTSUM 3
+#define XFS_LOCK_INUMORDER 4
+
+#define XFS_IOLOCK_SHIFT 16
+#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
+
+#define XFS_ILOCK_SHIFT 24
+#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
+
+#define XFS_IOLOCK_DEP_MASK 0x00ff0000
+#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK)
+
+#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+
+extern struct lock_class_key xfs_iolock_reclaimable;
+
+/*
+ * Flags for xfs_itruncate_start().
+ */
+#define XFS_ITRUNC_DEFINITE 0x1
+#define XFS_ITRUNC_MAYBE 0x2
+
+#define XFS_ITRUNC_FLAGS \
+ { XFS_ITRUNC_DEFINITE, "DEFINITE" }, \
+ { XFS_ITRUNC_MAYBE, "MAYBE" }
+
+/*
+ * For multiple groups support: if S_ISGID bit is set in the parent
+ * directory, group of new file is set to that of the parent, and
+ * new subdirectory gets S_ISGID bit from parent.
+ */
+#define XFS_INHERIT_GID(pip) \
+ (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
+ ((pip)->i_d.di_mode & S_ISGID))
+
+/*
+ * xfs_iget.c prototypes.
+ */
+int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+ uint, uint, xfs_inode_t **);
+void xfs_ilock(xfs_inode_t *, uint);
+int xfs_ilock_nowait(xfs_inode_t *, uint);
+void xfs_iunlock(xfs_inode_t *, uint);
+void xfs_ilock_demote(xfs_inode_t *, uint);
+int xfs_isilocked(xfs_inode_t *, uint);
+uint xfs_ilock_map_shared(xfs_inode_t *);
+void xfs_iunlock_map_shared(xfs_inode_t *, uint);
+void xfs_inode_free(struct xfs_inode *ip);
+
+/*
+ * xfs_inode.c prototypes.
+ */
+int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
+ xfs_nlink_t, xfs_dev_t, prid_t, int,
+ struct xfs_buf **, boolean_t *, xfs_inode_t **);
+
+uint xfs_ip2xflags(struct xfs_inode *);
+uint xfs_dic2xflags(struct xfs_dinode *);
+int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
+ struct xfs_bmap_free *);
+int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
+int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
+ xfs_fsize_t, int, int);
+int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
+
+void xfs_iext_realloc(xfs_inode_t *, int, int);
+void xfs_iunpin_wait(xfs_inode_t *);
+int xfs_iflush(xfs_inode_t *, uint);
+void xfs_promote_inode(struct xfs_inode *);
+void xfs_lock_inodes(xfs_inode_t **, int, uint);
+void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
+
+void xfs_synchronize_times(xfs_inode_t *);
+void xfs_mark_inode_dirty(xfs_inode_t *);
+void xfs_mark_inode_dirty_sync(xfs_inode_t *);
+
+#define IHOLD(ip) \
+do { \
+ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
+ ihold(VFS_I(ip)); \
+ trace_xfs_ihold(ip, _THIS_IP_); \
+} while (0)
+
+#define IRELE(ip) \
+do { \
+ trace_xfs_irele(ip, _THIS_IP_); \
+ iput(VFS_I(ip)); \
+} while (0)
+
+#endif /* __KERNEL__ */
+
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE 0x1
+#define XFS_IGET_UNTRUSTED 0x2
+
+int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
+ xfs_ino_t, struct xfs_dinode **,
+ struct xfs_buf **, int *, uint);
+int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_inode *, struct xfs_dinode **,
+ struct xfs_buf **, uint);
+int xfs_iread(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_inode *, uint);
+void xfs_dinode_to_disk(struct xfs_dinode *,
+ struct xfs_icdinode *);
+void xfs_idestroy_fork(struct xfs_inode *, int);
+void xfs_idata_realloc(struct xfs_inode *, int, int);
+void xfs_iroot_realloc(struct xfs_inode *, int, int);
+int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
+
+xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
+void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
+ xfs_bmbt_irec_t *, int);
+void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
+void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
+void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
+void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
+void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
+void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
+void xfs_iext_realloc_direct(xfs_ifork_t *, int);
+void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
+void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
+void xfs_iext_destroy(xfs_ifork_t *);
+xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *);
+xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *);
+xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int);
+void xfs_iext_irec_init(xfs_ifork_t *);
+xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int);
+void xfs_iext_irec_remove(xfs_ifork_t *, int);
+void xfs_iext_irec_compact(xfs_ifork_t *);
+void xfs_iext_irec_compact_pages(xfs_ifork_t *);
+void xfs_iext_irec_compact_full(xfs_ifork_t *);
+void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
+
+#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
+
+#ifdef DEBUG
+void xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
+ xfs_fsize_t);
+#else /* DEBUG */
+#define xfs_isize_check(mp, ip, isize)
+#endif /* DEBUG */
+
+#if defined(DEBUG)
+void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+extern struct kmem_zone *xfs_ifork_zone;
+extern struct kmem_zone *xfs_inode_zone;
+extern struct kmem_zone *xfs_ili_zone;
+
+#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
new file mode 100644
index 00000000..391044c6
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.c
@@ -0,0 +1,1060 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+
+
+kmem_zone_t *xfs_ili_zone; /* inode log item zone */
+
+static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_inode_log_item, ili_item);
+}
+
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We need one iovec for the inode log format structure, one for the
+ * inode core, and possibly one for the inode data/extents/b-tree root
+ * and one for the inode attribute data/extents/b-tree root.
+ */
+STATIC uint
+xfs_inode_item_size(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ uint nvecs = 2;
+
+ /*
+ * Only log the data/extents/b-tree root if there is something
+ * left to log.
+ */
+ iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
+
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+ if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
+ (ip->i_d.di_nextents > 0) &&
+ (ip->i_df.if_bytes > 0)) {
+ ASSERT(ip->i_df.if_u1.if_extents != NULL);
+ nvecs++;
+ } else {
+ iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ ASSERT(ip->i_df.if_ext_max ==
+ XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+ if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
+ (ip->i_df.if_broot_bytes > 0)) {
+ ASSERT(ip->i_df.if_broot != NULL);
+ nvecs++;
+ } else {
+ ASSERT(!(iip->ili_format.ilf_fields &
+ XFS_ILOG_DBROOT));
+#ifdef XFS_TRANS_DEBUG
+ if (iip->ili_root_size > 0) {
+ ASSERT(iip->ili_root_size ==
+ ip->i_df.if_broot_bytes);
+ ASSERT(memcmp(iip->ili_orig_root,
+ ip->i_df.if_broot,
+ iip->ili_root_size) == 0);
+ } else {
+ ASSERT(ip->i_df.if_broot_bytes == 0);
+ }
+#endif
+ iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
+ }
+ break;
+
+ case XFS_DINODE_FMT_LOCAL:
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID);
+ if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
+ (ip->i_df.if_bytes > 0)) {
+ ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_d.di_size > 0);
+ nvecs++;
+ } else {
+ iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
+ }
+ break;
+
+ case XFS_DINODE_FMT_DEV:
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEXT | XFS_ILOG_UUID);
+ break;
+
+ case XFS_DINODE_FMT_UUID:
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEXT | XFS_ILOG_DEV);
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ /*
+ * If there are no attributes associated with this file,
+ * then there cannot be anything more to log.
+ * Clear all attribute-related log flags.
+ */
+ if (!XFS_IFORK_Q(ip)) {
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ return nvecs;
+ }
+
+ /*
+ * Log any necessary attribute data.
+ */
+ switch (ip->i_d.di_aformat) {
+ case XFS_DINODE_FMT_EXTENTS:
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+ if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
+ (ip->i_d.di_anextents > 0) &&
+ (ip->i_afp->if_bytes > 0)) {
+ ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+ nvecs++;
+ } else {
+ iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+ if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
+ (ip->i_afp->if_broot_bytes > 0)) {
+ ASSERT(ip->i_afp->if_broot != NULL);
+ nvecs++;
+ } else {
+ iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
+ }
+ break;
+
+ case XFS_DINODE_FMT_LOCAL:
+ iip->ili_format.ilf_fields &=
+ ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+ if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
+ (ip->i_afp->if_bytes > 0)) {
+ ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ nvecs++;
+ } else {
+ iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return nvecs;
+}
+
+/*
+ * xfs_inode_item_format_extents - convert in-core extents to on-disk form
+ *
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode. In this case, we
+ * need to do this conversion before we write the extents into the log. Because
+ * we don't have the disk inode to write into here, we allocate a buffer and
+ * format the extents into it via xfs_iextents_copy(). We free the buffer in
+ * the unlock routine after the copy for the log has been made.
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only log on-disk extents
+ * here, so always use the physical fork size to determine the size of the
+ * buffer we need to allocate.
+ */
+STATIC void
+xfs_inode_item_format_extents(
+ struct xfs_inode *ip,
+ struct xfs_log_iovec *vecp,
+ int whichfork,
+ int type)
+{
+ xfs_bmbt_rec_t *ext_buffer;
+
+ ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
+ if (whichfork == XFS_DATA_FORK)
+ ip->i_itemp->ili_extents_buf = ext_buffer;
+ else
+ ip->i_itemp->ili_aextents_buf = ext_buffer;
+
+ vecp->i_addr = ext_buffer;
+ vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
+ vecp->i_type = type;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode log item. It fills the first item with an inode
+ * log format structure, the second with the on-disk inode structure,
+ * and a possible third and/or fourth with the inode data/extents/b-tree
+ * root and inode attributes data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_iovec *vecp)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ uint nvecs;
+ size_t data_bytes;
+ xfs_mount_t *mp;
+
+ vecp->i_addr = &iip->ili_format;
+ vecp->i_len = sizeof(xfs_inode_log_format_t);
+ vecp->i_type = XLOG_REG_TYPE_IFORMAT;
+ vecp++;
+ nvecs = 1;
+
+ /*
+ * Clear i_update_core if the timestamps (or any other
+ * non-transactional modification) need flushing/logging
+ * and we're about to log them with the rest of the core.
+ *
+ * This is the same logic as xfs_iflush() but this code can't
+ * run at the same time as xfs_iflush because we're in commit
+ * processing here and so we have the inode lock held in
+ * exclusive mode. Although it doesn't really matter
+ * for the timestamps if both routines were to grab the
+ * timestamps or not. That would be ok.
+ *
+ * We clear i_update_core before copying out the data.
+ * This is for coordination with our timestamp updates
+ * that don't hold the inode lock. They will always
+ * update the timestamps BEFORE setting i_update_core,
+ * so if we clear i_update_core after they set it we
+ * are guaranteed to see their updates to the timestamps
+ * either here. Likewise, if they set it after we clear it
+ * here, we'll see it either on the next commit of this
+ * inode or the next time the inode gets flushed via
+ * xfs_iflush(). This depends on strongly ordered memory
+ * semantics, but we have that. We use the SYNCHRONIZE
+ * macro to make sure that the compiler does not reorder
+ * the i_update_core access below the data copy below.
+ */
+ if (ip->i_update_core) {
+ ip->i_update_core = 0;
+ SYNCHRONIZE();
+ }
+
+ /*
+ * Make sure to get the latest timestamps from the Linux inode.
+ */
+ xfs_synchronize_times(ip);
+
+ vecp->i_addr = &ip->i_d;
+ vecp->i_len = sizeof(struct xfs_icdinode);
+ vecp->i_type = XLOG_REG_TYPE_ICORE;
+ vecp++;
+ nvecs++;
+ iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
+
+ /*
+ * If this is really an old format inode, then we need to
+ * log it as such. This means that we have to copy the link
+ * count from the new field to the old. We don't have to worry
+ * about the new fields, because nothing trusts them as long as
+ * the old inode version number is there. If the superblock already
+ * has a new version number, then we don't bother converting back.
+ */
+ mp = ip->i_mount;
+ ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
+ if (ip->i_d.di_version == 1) {
+ if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+ /*
+ * Convert it back.
+ */
+ ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
+ ip->i_d.di_onlink = ip->i_d.di_nlink;
+ } else {
+ /*
+ * The superblock version has already been bumped,
+ * so just make the conversion to the new inode
+ * format permanent.
+ */
+ ip->i_d.di_version = 2;
+ ip->i_d.di_onlink = 0;
+ memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+ }
+ }
+
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
+ ASSERT(ip->i_df.if_bytes > 0);
+ ASSERT(ip->i_df.if_u1.if_extents != NULL);
+ ASSERT(ip->i_d.di_nextents > 0);
+ ASSERT(iip->ili_extents_buf == NULL);
+ ASSERT((ip->i_df.if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)) > 0);
+#ifdef XFS_NATIVE_HOST
+ if (ip->i_d.di_nextents == ip->i_df.if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t)) {
+ /*
+ * There are no delayed allocation
+ * extents, so just point to the
+ * real extents array.
+ */
+ vecp->i_addr = ip->i_df.if_u1.if_extents;
+ vecp->i_len = ip->i_df.if_bytes;
+ vecp->i_type = XLOG_REG_TYPE_IEXT;
+ } else
+#endif
+ {
+ xfs_inode_item_format_extents(ip, vecp,
+ XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
+ }
+ ASSERT(vecp->i_len <= ip->i_df.if_bytes);
+ iip->ili_format.ilf_dsize = vecp->i_len;
+ vecp++;
+ nvecs++;
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
+ ASSERT(ip->i_df.if_broot_bytes > 0);
+ ASSERT(ip->i_df.if_broot != NULL);
+ vecp->i_addr = ip->i_df.if_broot;
+ vecp->i_len = ip->i_df.if_broot_bytes;
+ vecp->i_type = XLOG_REG_TYPE_IBROOT;
+ vecp++;
+ nvecs++;
+ iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+ }
+ break;
+
+ case XFS_DINODE_FMT_LOCAL:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+ XFS_ILOG_DEV | XFS_ILOG_UUID)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
+ ASSERT(ip->i_df.if_bytes > 0);
+ ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_d.di_size > 0);
+
+ vecp->i_addr = ip->i_df.if_u1.if_data;
+ /*
+ * Round i_bytes up to a word boundary.
+ * The underlying memory is guaranteed to
+ * to be there by xfs_idata_realloc().
+ */
+ data_bytes = roundup(ip->i_df.if_bytes, 4);
+ ASSERT((ip->i_df.if_real_bytes == 0) ||
+ (ip->i_df.if_real_bytes == data_bytes));
+ vecp->i_len = (int)data_bytes;
+ vecp->i_type = XLOG_REG_TYPE_ILOCAL;
+ vecp++;
+ nvecs++;
+ iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+ }
+ break;
+
+ case XFS_DINODE_FMT_DEV:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+ XFS_ILOG_DDATA | XFS_ILOG_UUID)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+ iip->ili_format.ilf_u.ilfu_rdev =
+ ip->i_df.if_u2.if_rdev;
+ }
+ break;
+
+ case XFS_DINODE_FMT_UUID:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+ XFS_ILOG_DDATA | XFS_ILOG_DEV)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+ iip->ili_format.ilf_u.ilfu_uuid =
+ ip->i_df.if_u2.if_uuid;
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ /*
+ * If there are no attributes associated with the file,
+ * then we're done.
+ * Assert that no attribute-related log flags are set.
+ */
+ if (!XFS_IFORK_Q(ip)) {
+ ASSERT(nvecs == lip->li_desc->lid_size);
+ iip->ili_format.ilf_size = nvecs;
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
+ return;
+ }
+
+ switch (ip->i_d.di_aformat) {
+ case XFS_DINODE_FMT_EXTENTS:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
+#ifdef DEBUG
+ int nrecs = ip->i_afp->if_bytes /
+ (uint)sizeof(xfs_bmbt_rec_t);
+ ASSERT(nrecs > 0);
+ ASSERT(nrecs == ip->i_d.di_anextents);
+ ASSERT(ip->i_afp->if_bytes > 0);
+ ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+ ASSERT(ip->i_d.di_anextents > 0);
+#endif
+#ifdef XFS_NATIVE_HOST
+ /*
+ * There are not delayed allocation extents
+ * for attributes, so just point at the array.
+ */
+ vecp->i_addr = ip->i_afp->if_u1.if_extents;
+ vecp->i_len = ip->i_afp->if_bytes;
+ vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
+#else
+ ASSERT(iip->ili_aextents_buf == NULL);
+ xfs_inode_item_format_extents(ip, vecp,
+ XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
+#endif
+ iip->ili_format.ilf_asize = vecp->i_len;
+ vecp++;
+ nvecs++;
+ }
+ break;
+
+ case XFS_DINODE_FMT_BTREE:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
+ ASSERT(ip->i_afp->if_broot_bytes > 0);
+ ASSERT(ip->i_afp->if_broot != NULL);
+ vecp->i_addr = ip->i_afp->if_broot;
+ vecp->i_len = ip->i_afp->if_broot_bytes;
+ vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
+ vecp++;
+ nvecs++;
+ iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+ }
+ break;
+
+ case XFS_DINODE_FMT_LOCAL:
+ ASSERT(!(iip->ili_format.ilf_fields &
+ (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
+ if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
+ ASSERT(ip->i_afp->if_bytes > 0);
+ ASSERT(ip->i_afp->if_u1.if_data != NULL);
+
+ vecp->i_addr = ip->i_afp->if_u1.if_data;
+ /*
+ * Round i_bytes up to a word boundary.
+ * The underlying memory is guaranteed to
+ * to be there by xfs_idata_realloc().
+ */
+ data_bytes = roundup(ip->i_afp->if_bytes, 4);
+ ASSERT((ip->i_afp->if_real_bytes == 0) ||
+ (ip->i_afp->if_real_bytes == data_bytes));
+ vecp->i_len = (int)data_bytes;
+ vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
+ vecp++;
+ nvecs++;
+ iip->ili_format.ilf_asize = (unsigned)data_bytes;
+ }
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ ASSERT(nvecs == lip->li_desc->lid_size);
+ iip->ili_format.ilf_size = nvecs;
+}
+
+
+/*
+ * This is called to pin the inode associated with the inode log
+ * item in memory so it cannot be written out.
+ */
+STATIC void
+xfs_inode_item_pin(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ trace_xfs_inode_pin(ip, _RET_IP_);
+ atomic_inc(&ip->i_pincount);
+}
+
+
+/*
+ * This is called to unpin the inode associated with the inode log
+ * item which was previously pinned with a call to xfs_inode_item_pin().
+ *
+ * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
+ */
+STATIC void
+xfs_inode_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+
+ trace_xfs_inode_unpin(ip, _RET_IP_);
+ ASSERT(atomic_read(&ip->i_pincount) > 0);
+ if (atomic_dec_and_test(&ip->i_pincount))
+ wake_up(&ip->i_ipin_wait);
+}
+
+/*
+ * This is called to attempt to lock the inode associated with this
+ * inode log item, in preparation for the push routine which does the actual
+ * iflush. Don't sleep on the inode lock or the flush lock.
+ *
+ * If the flush lock is already held, indicating that the inode has
+ * been or is in the process of being flushed, then (ideally) we'd like to
+ * see if the inode's buffer is still incore, and if so give it a nudge.
+ * We delay doing so until the pushbuf routine, though, to avoid holding
+ * the AIL lock across a call to the blackhole which is the buffer cache.
+ * Also we don't want to sleep in any device strategy routines, which can happen
+ * if we do the subsequent bawrite in here.
+ */
+STATIC uint
+xfs_inode_item_trylock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+
+ if (xfs_ipincount(ip) > 0)
+ return XFS_ITEM_PINNED;
+
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_iflock_nowait(ip)) {
+ /*
+ * inode has already been flushed to the backing buffer,
+ * leave it locked in shared mode, pushbuf routine will
+ * unlock it.
+ */
+ return XFS_ITEM_PUSHBUF;
+ }
+
+ /* Stale items should force out the iclog */
+ if (ip->i_flags & XFS_ISTALE) {
+ xfs_ifunlock(ip);
+ /*
+ * we hold the AIL lock - notify the unlock routine of this
+ * so it doesn't try to get the lock again.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
+ return XFS_ITEM_PINNED;
+ }
+
+#ifdef DEBUG
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ ASSERT(iip->ili_format.ilf_fields != 0);
+ ASSERT(iip->ili_logged == 0);
+ ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ }
+#endif
+ return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Unlock the inode associated with the inode log item.
+ * Clear the fields of the inode and inode log item that
+ * are specific to the current transaction. If the
+ * hold flags is set, do not unlock the inode.
+ */
+STATIC void
+xfs_inode_item_unlock(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ unsigned short lock_flags;
+
+ ASSERT(iip->ili_inode->i_itemp != NULL);
+ ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
+
+ /*
+ * Clear the transaction pointer in the inode.
+ */
+ ip->i_transp = NULL;
+
+ /*
+ * If the inode needed a separate buffer with which to log
+ * its extents, then free it now.
+ */
+ if (iip->ili_extents_buf != NULL) {
+ ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(ip->i_d.di_nextents > 0);
+ ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
+ ASSERT(ip->i_df.if_bytes > 0);
+ kmem_free(iip->ili_extents_buf);
+ iip->ili_extents_buf = NULL;
+ }
+ if (iip->ili_aextents_buf != NULL) {
+ ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(ip->i_d.di_anextents > 0);
+ ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
+ ASSERT(ip->i_afp->if_bytes > 0);
+ kmem_free(iip->ili_aextents_buf);
+ iip->ili_aextents_buf = NULL;
+ }
+
+ lock_flags = iip->ili_lock_flags;
+ iip->ili_lock_flags = 0;
+ if (lock_flags) {
+ xfs_iunlock(iip->ili_inode, lock_flags);
+ IRELE(iip->ili_inode);
+ }
+}
+
+/*
+ * This is called to find out where the oldest active copy of the inode log
+ * item in the on disk log resides now that the last log write of it completed
+ * at the given lsn. Since we always re-log all dirty data in an inode, the
+ * latest copy in the on disk log is the only one that matters. Therefore,
+ * simply return the given lsn.
+ *
+ * If the inode has been marked stale because the cluster is being freed, we
+ * don't want to (re-)insert this inode into the AIL. There is a race condition
+ * where the cluster buffer may be unpinned before the inode is inserted into
+ * the AIL during transaction committed processing. If the buffer is unpinned
+ * before the inode item has been committed and inserted, then it is possible
+ * for the buffer to be written and IO completes before the inode is inserted
+ * into the AIL. In that case, we'd be inserting a clean, stale inode into the
+ * AIL which will never get removed. It will, however, get reclaimed which
+ * triggers an assert in xfs_inode_free() complaining about freein an inode
+ * still in the AIL.
+ *
+ * To avoid this, just unpin the inode directly and return a LSN of -1 so the
+ * transaction committed code knows that it does not need to do any further
+ * processing on the item.
+ */
+STATIC xfs_lsn_t
+xfs_inode_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+
+ if (xfs_iflags_test(ip, XFS_ISTALE)) {
+ xfs_inode_item_unpin(lip, 0);
+ return -1;
+ }
+ return lsn;
+}
+
+/*
+ * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
+ * failed to get the inode flush lock but did get the inode locked SHARED.
+ * Here we're trying to see if the inode buffer is incore, and if so whether it's
+ * marked delayed write. If that's the case, we'll promote it and that will
+ * allow the caller to write the buffer by triggering the xfsbufd to run.
+ */
+STATIC bool
+xfs_inode_item_pushbuf(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_buf *bp;
+ bool ret = true;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
+
+ /*
+ * If a flush is not in progress anymore, chances are that the
+ * inode was taken off the AIL. So, just get out.
+ */
+ if (completion_done(&ip->i_flush) ||
+ !(lip->li_flags & XFS_LI_IN_AIL)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return true;
+ }
+
+ bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
+ iip->ili_format.ilf_len, XBF_TRYLOCK);
+
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (!bp)
+ return true;
+ if (XFS_BUF_ISDELAYWRITE(bp))
+ xfs_buf_delwri_promote(bp);
+ if (XFS_BUF_ISPINNED(bp))
+ ret = false;
+ xfs_buf_relse(bp);
+ return ret;
+}
+
+/*
+ * This is called to asynchronously write the inode associated with this
+ * inode log item out to disk. The inode will already have been locked by
+ * a successful call to xfs_inode_item_trylock().
+ */
+STATIC void
+xfs_inode_item_push(
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
+ ASSERT(!completion_done(&ip->i_flush));
+
+ /*
+ * Since we were able to lock the inode's flush lock and
+ * we found it on the AIL, the inode must be dirty. This
+ * is because the inode is removed from the AIL while still
+ * holding the flush lock in xfs_iflush_done(). Thus, if
+ * we found it in the AIL and were able to obtain the flush
+ * lock without sleeping, then there must not have been
+ * anyone in the process of flushing the inode.
+ */
+ ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
+ iip->ili_format.ilf_fields != 0);
+
+ /*
+ * Push the inode to it's backing buffer. This will not remove the
+ * inode from the AIL - a further push will be required to trigger a
+ * buffer push. However, this allows all the dirty inodes to be pushed
+ * to the buffer before it is pushed to disk. The buffer IO completion
+ * will pull the inode from the AIL, mark it clean and unlock the flush
+ * lock.
+ */
+ (void) xfs_iflush(ip, SYNC_TRYLOCK);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+}
+
+/*
+ * XXX rcc - this one really has to do something. Probably needs
+ * to stamp in a new field in the incore inode.
+ */
+STATIC void
+xfs_inode_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ INODE_ITEM(lip)->ili_last_lsn = lsn;
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_inode_item_ops = {
+ .iop_size = xfs_inode_item_size,
+ .iop_format = xfs_inode_item_format,
+ .iop_pin = xfs_inode_item_pin,
+ .iop_unpin = xfs_inode_item_unpin,
+ .iop_trylock = xfs_inode_item_trylock,
+ .iop_unlock = xfs_inode_item_unlock,
+ .iop_committed = xfs_inode_item_committed,
+ .iop_push = xfs_inode_item_push,
+ .iop_pushbuf = xfs_inode_item_pushbuf,
+ .iop_committing = xfs_inode_item_committing
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ */
+void
+xfs_inode_item_init(
+ struct xfs_inode *ip,
+ struct xfs_mount *mp)
+{
+ struct xfs_inode_log_item *iip;
+
+ ASSERT(ip->i_itemp == NULL);
+ iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+
+ iip->ili_inode = ip;
+ xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
+ &xfs_inode_item_ops);
+ iip->ili_format.ilf_type = XFS_LI_INODE;
+ iip->ili_format.ilf_ino = ip->i_ino;
+ iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
+ iip->ili_format.ilf_len = ip->i_imap.im_len;
+ iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
+}
+
+/*
+ * Free the inode log item and any memory hanging off of it.
+ */
+void
+xfs_inode_item_destroy(
+ xfs_inode_t *ip)
+{
+#ifdef XFS_TRANS_DEBUG
+ if (ip->i_itemp->ili_root_size != 0) {
+ kmem_free(ip->i_itemp->ili_orig_root);
+ }
+#endif
+ kmem_zone_free(xfs_ili_zone, ip->i_itemp);
+}
+
+
+/*
+ * This is the inode flushing I/O completion routine. It is called
+ * from interrupt level when the buffer containing the inode is
+ * flushed to disk. It is responsible for removing the inode item
+ * from the AIL if it has not been re-logged, and unlocking the inode's
+ * flush lock.
+ *
+ * To reduce AIL lock traffic as much as possible, we scan the buffer log item
+ * list for other inodes that will run this function. We remove them from the
+ * buffer list so we can process all the inode IO completions in one AIL lock
+ * traversal.
+ */
+void
+xfs_iflush_done(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_inode_log_item *iip;
+ struct xfs_log_item *blip;
+ struct xfs_log_item *next;
+ struct xfs_log_item *prev;
+ struct xfs_ail *ailp = lip->li_ailp;
+ int need_ail = 0;
+
+ /*
+ * Scan the buffer IO completions for other inodes being completed and
+ * attach them to the current inode log item.
+ */
+ blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ prev = NULL;
+ while (blip != NULL) {
+ if (lip->li_cb != xfs_iflush_done) {
+ prev = blip;
+ blip = blip->li_bio_list;
+ continue;
+ }
+
+ /* remove from list */
+ next = blip->li_bio_list;
+ if (!prev) {
+ XFS_BUF_SET_FSPRIVATE(bp, next);
+ } else {
+ prev->li_bio_list = next;
+ }
+
+ /* add to current list */
+ blip->li_bio_list = lip->li_bio_list;
+ lip->li_bio_list = blip;
+
+ /*
+ * while we have the item, do the unlocked check for needing
+ * the AIL lock.
+ */
+ iip = INODE_ITEM(blip);
+ if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+ need_ail++;
+
+ blip = next;
+ }
+
+ /* make sure we capture the state of the initial inode. */
+ iip = INODE_ITEM(lip);
+ if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+ need_ail++;
+
+ /*
+ * We only want to pull the item from the AIL if it is
+ * actually there and its location in the log has not
+ * changed since we started the flush. Thus, we only bother
+ * if the ili_logged flag is set and the inode's lsn has not
+ * changed. First we check the lsn outside
+ * the lock since it's cheaper, and then we recheck while
+ * holding the lock before removing the inode from the AIL.
+ */
+ if (need_ail) {
+ struct xfs_log_item *log_items[need_ail];
+ int i = 0;
+ spin_lock(&ailp->xa_lock);
+ for (blip = lip; blip; blip = blip->li_bio_list) {
+ iip = INODE_ITEM(blip);
+ if (iip->ili_logged &&
+ blip->li_lsn == iip->ili_flush_lsn) {
+ log_items[i++] = blip;
+ }
+ ASSERT(i <= need_ail);
+ }
+ /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
+ xfs_trans_ail_delete_bulk(ailp, log_items, i);
+ }
+
+
+ /*
+ * clean up and unlock the flush lock now we are done. We can clear the
+ * ili_last_fields bits now that we know that the data corresponding to
+ * them is safely on disk.
+ */
+ for (blip = lip; blip; blip = next) {
+ next = blip->li_bio_list;
+ blip->li_bio_list = NULL;
+
+ iip = INODE_ITEM(blip);
+ iip->ili_logged = 0;
+ iip->ili_last_fields = 0;
+ xfs_ifunlock(iip->ili_inode);
+ }
+}
+
+/*
+ * This is the inode flushing abort routine. It is called
+ * from xfs_iflush when the filesystem is shutting down to clean
+ * up the inode state.
+ * It is responsible for removing the inode item
+ * from the AIL if it has not been re-logged, and unlocking the inode's
+ * flush lock.
+ */
+void
+xfs_iflush_abort(
+ xfs_inode_t *ip)
+{
+ xfs_inode_log_item_t *iip = ip->i_itemp;
+
+ if (iip) {
+ struct xfs_ail *ailp = iip->ili_item.li_ailp;
+ if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+ spin_lock(&ailp->xa_lock);
+ if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
+ } else
+ spin_unlock(&ailp->xa_lock);
+ }
+ iip->ili_logged = 0;
+ /*
+ * Clear the ili_last_fields bits now that we know that the
+ * data corresponding to them is safely on disk.
+ */
+ iip->ili_last_fields = 0;
+ /*
+ * Clear the inode logging fields so no more flushes are
+ * attempted.
+ */
+ iip->ili_format.ilf_fields = 0;
+ }
+ /*
+ * Release the inode's flush lock since we're done with it.
+ */
+ xfs_ifunlock(ip);
+}
+
+void
+xfs_istale_done(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip)
+{
+ xfs_iflush_abort(INODE_ITEM(lip)->ili_inode);
+}
+
+/*
+ * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
+ * (which can have different field alignments) to the native version
+ */
+int
+xfs_inode_item_format_convert(
+ xfs_log_iovec_t *buf,
+ xfs_inode_log_format_t *in_f)
+{
+ if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
+ xfs_inode_log_format_32_t *in_f32 = buf->i_addr;
+
+ in_f->ilf_type = in_f32->ilf_type;
+ in_f->ilf_size = in_f32->ilf_size;
+ in_f->ilf_fields = in_f32->ilf_fields;
+ in_f->ilf_asize = in_f32->ilf_asize;
+ in_f->ilf_dsize = in_f32->ilf_dsize;
+ in_f->ilf_ino = in_f32->ilf_ino;
+ /* copy biggest field of ilf_u */
+ memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+ in_f32->ilf_u.ilfu_uuid.__u_bits,
+ sizeof(uuid_t));
+ in_f->ilf_blkno = in_f32->ilf_blkno;
+ in_f->ilf_len = in_f32->ilf_len;
+ in_f->ilf_boffset = in_f32->ilf_boffset;
+ return 0;
+ } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
+ xfs_inode_log_format_64_t *in_f64 = buf->i_addr;
+
+ in_f->ilf_type = in_f64->ilf_type;
+ in_f->ilf_size = in_f64->ilf_size;
+ in_f->ilf_fields = in_f64->ilf_fields;
+ in_f->ilf_asize = in_f64->ilf_asize;
+ in_f->ilf_dsize = in_f64->ilf_dsize;
+ in_f->ilf_ino = in_f64->ilf_ino;
+ /* copy biggest field of ilf_u */
+ memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+ in_f64->ilf_u.ilfu_uuid.__u_bits,
+ sizeof(uuid_t));
+ in_f->ilf_blkno = in_f64->ilf_blkno;
+ in_f->ilf_len = in_f64->ilf_len;
+ in_f->ilf_boffset = in_f64->ilf_boffset;
+ return 0;
+ }
+ return EFSCORRUPTED;
+}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
new file mode 100644
index 00000000..d3dee61e
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_INODE_ITEM_H__
+#define __XFS_INODE_ITEM_H__
+
+/*
+ * This is the structure used to lay out an inode log item in the
+ * log. The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field. Changes to this structure
+ * must be added on to the end.
+ */
+typedef struct xfs_inode_log_format {
+ __uint16_t ilf_type; /* inode log item type */
+ __uint16_t ilf_size; /* size of this item */
+ __uint32_t ilf_fields; /* flags for fields logged */
+ __uint16_t ilf_asize; /* size of attr d/ext/root */
+ __uint16_t ilf_dsize; /* size of data/ext/root */
+ __uint64_t ilf_ino; /* inode number */
+ union {
+ __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uuid_t ilfu_uuid; /* mount point value */
+ } ilf_u;
+ __int64_t ilf_blkno; /* blkno of inode buffer */
+ __int32_t ilf_len; /* len of inode buffer */
+ __int32_t ilf_boffset; /* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+typedef struct xfs_inode_log_format_32 {
+ __uint16_t ilf_type; /* inode log item type */
+ __uint16_t ilf_size; /* size of this item */
+ __uint32_t ilf_fields; /* flags for fields logged */
+ __uint16_t ilf_asize; /* size of attr d/ext/root */
+ __uint16_t ilf_dsize; /* size of data/ext/root */
+ __uint64_t ilf_ino; /* inode number */
+ union {
+ __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uuid_t ilfu_uuid; /* mount point value */
+ } ilf_u;
+ __int64_t ilf_blkno; /* blkno of inode buffer */
+ __int32_t ilf_len; /* len of inode buffer */
+ __int32_t ilf_boffset; /* off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+ __uint16_t ilf_type; /* inode log item type */
+ __uint16_t ilf_size; /* size of this item */
+ __uint32_t ilf_fields; /* flags for fields logged */
+ __uint16_t ilf_asize; /* size of attr d/ext/root */
+ __uint16_t ilf_dsize; /* size of data/ext/root */
+ __uint32_t ilf_pad; /* pad for 64 bit boundary */
+ __uint64_t ilf_ino; /* inode number */
+ union {
+ __uint32_t ilfu_rdev; /* rdev value for dev inode*/
+ uuid_t ilfu_uuid; /* mount point value */
+ } ilf_u;
+ __int64_t ilf_blkno; /* blkno of inode buffer */
+ __int32_t ilf_len; /* len of inode buffer */
+ __int32_t ilf_boffset; /* off of inode in buffer */
+} xfs_inode_log_format_64_t;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
+#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
+#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
+#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
+#define XFS_ILOG_DEV 0x010 /* log the dev field */
+#define XFS_ILOG_UUID 0x020 /* log the uuid field */
+#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
+#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
+#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
+
+#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+ XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+ XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT)
+
+#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+ XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+ XFS_ILOG_DEV | XFS_ILOG_UUID | \
+ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+ XFS_ILOG_ABROOT)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_bmbt_rec;
+struct xfs_inode;
+struct xfs_mount;
+
+
+typedef struct xfs_inode_log_item {
+ xfs_log_item_t ili_item; /* common portion */
+ struct xfs_inode *ili_inode; /* inode ptr */
+ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
+ unsigned short ili_lock_flags; /* lock flags */
+ unsigned short ili_logged; /* flushed logged data */
+ unsigned int ili_last_fields; /* fields when flushed */
+ struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
+ data exts */
+ struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
+ attr exts */
+#ifdef XFS_TRANS_DEBUG
+ int ili_root_size;
+ char *ili_orig_root;
+#endif
+ xfs_inode_log_format_t ili_format; /* logged structure */
+} xfs_inode_log_item_t;
+
+
+static inline int xfs_inode_clean(xfs_inode_t *ip)
+{
+ return (!ip->i_itemp ||
+ !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+ !ip->i_update_core;
+}
+
+extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
+extern void xfs_inode_item_destroy(struct xfs_inode *);
+extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
+extern void xfs_iflush_abort(struct xfs_inode *);
+extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
+ xfs_inode_log_format_t *);
+
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
new file mode 100644
index 00000000..b8e4ee4e
--- /dev/null
+++ b/fs/xfs/xfs_inum.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_INUM_H__
+#define __XFS_INUM_H__
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
+
+typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */
+
+/*
+ * Useful inode bits for this kernel.
+ * Used in some places where having 64-bits in the 32-bit kernels
+ * costs too much.
+ */
+#if XFS_BIG_INUMS
+typedef xfs_ino_t xfs_intino_t;
+#else
+typedef __uint32_t xfs_intino_t;
+#endif
+
+#define NULLFSINO ((xfs_ino_t)-1)
+#define NULLAGINO ((xfs_agino_t)-1)
+
+struct xfs_mount;
+
+#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1)
+#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
+#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
+#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
+#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
+#define XFS_INO_BITS(mp) \
+ XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
+#define XFS_INO_TO_AGNO(mp,i) \
+ ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGINO(mp,i) \
+ ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#define XFS_INO_TO_AGBNO(mp,i) \
+ (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+ XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#define XFS_INO_TO_OFFSET(mp,i) \
+ ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_INO_TO_FSB(mp,i) \
+ XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#define XFS_AGINO_TO_INO(mp,a,i) \
+ (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
+#define XFS_AGINO_TO_OFFSET(mp,i) \
+ ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
+ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+
+#if XFS_BIG_INUMS
+#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
+#else
+#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
+#endif
+#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
+
+#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
new file mode 100644
index 00000000..091d82b9
--- /dev/null
+++ b/fs/xfs/xfs_iomap.c
@@ -0,0 +1,748 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_rw.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_utils.h"
+#include "xfs_iomap.h"
+#include "xfs_trace.h"
+
+
+#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
+ << mp->m_writeio_log)
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
+
+STATIC int
+xfs_iomap_eof_align_last_fsb(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ xfs_extlen_t extsize,
+ xfs_fileoff_t *last_fsb)
+{
+ xfs_fileoff_t new_last_fsb = 0;
+ xfs_extlen_t align;
+ int eof, error;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ ;
+ /*
+ * If mounted with the "-o swalloc" option, roundup the allocation
+ * request to a stripe width boundary if the file size is >=
+ * stripe width and we are allocating past the allocation eof.
+ */
+ else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
+ (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
+ new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
+ /*
+ * Roundup the allocation request to a stripe unit (m_dalign) boundary
+ * if the file size is >= stripe unit size, and we are allocating past
+ * the allocation eof.
+ */
+ else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
+ new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+
+ /*
+ * Always round up the allocation request to an extent boundary
+ * (when file on a real-time subvolume or has di_extsize hint).
+ */
+ if (extsize) {
+ if (new_last_fsb)
+ align = roundup_64(new_last_fsb, extsize);
+ else
+ align = extsize;
+ new_last_fsb = roundup_64(*last_fsb, align);
+ }
+
+ if (new_last_fsb) {
+ error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
+ if (error)
+ return error;
+ if (eof)
+ *last_fsb = new_last_fsb;
+ }
+ return 0;
+}
+
+STATIC int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x\n",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return EFSCORRUPTED;
+}
+
+int
+xfs_iomap_write_direct(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ size_t count,
+ xfs_bmbt_irec_t *imap,
+ int nmaps)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_filblks_t count_fsb, resaligned;
+ xfs_fsblock_t firstfsb;
+ xfs_extlen_t extsz, temp;
+ int nimaps;
+ int bmapi_flag;
+ int quota_flag;
+ int rt;
+ xfs_trans_t *tp;
+ xfs_bmap_free_t free_list;
+ uint qblocks, resblks, resrtextents;
+ int committed;
+ int error;
+
+ /*
+ * Make sure that the dquots are there. This doesn't hold
+ * the ilock across a disk read.
+ */
+ error = xfs_qm_dqattach_locked(ip, 0);
+ if (error)
+ return XFS_ERROR(error);
+
+ rt = XFS_IS_REALTIME_INODE(ip);
+ extsz = xfs_get_extsz_hint(ip);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+ if ((offset + count) > ip->i_size) {
+ error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+ if (error)
+ goto error_out;
+ } else {
+ if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
+ last_fsb = MIN(last_fsb, (xfs_fileoff_t)
+ imap->br_blockcount +
+ imap->br_startoff);
+ }
+ count_fsb = last_fsb - offset_fsb;
+ ASSERT(count_fsb > 0);
+
+ resaligned = count_fsb;
+ if (unlikely(extsz)) {
+ if ((temp = do_mod(offset_fsb, extsz)))
+ resaligned += temp;
+ if ((temp = do_mod(resaligned, extsz)))
+ resaligned += extsz - temp;
+ }
+
+ if (unlikely(rt)) {
+ resrtextents = qblocks = resaligned;
+ resrtextents /= mp->m_sb.sb_rextsize;
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ quota_flag = XFS_QMOPT_RES_RTBLKS;
+ } else {
+ resrtextents = 0;
+ resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ quota_flag = XFS_QMOPT_RES_REGBLKS;
+ }
+
+ /*
+ * Allocate and setup the transaction
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, resblks,
+ XFS_WRITE_LOG_RES(mp), resrtextents,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ /*
+ * Check for running out of space, note: need lock to return
+ */
+ if (error)
+ xfs_trans_cancel(tp, 0);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto error_out;
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+ if (error)
+ goto error1;
+
+ xfs_trans_ijoin(tp, ip);
+
+ bmapi_flag = XFS_BMAPI_WRITE;
+ if (offset < ip->i_size || extsz)
+ bmapi_flag |= XFS_BMAPI_PREALLOC;
+
+ /*
+ * Issue the xfs_bmapi() call to allocate the blocks.
+ *
+ * From this point onwards we overwrite the imap pointer that the
+ * caller gave to us.
+ */
+ xfs_bmap_init(&free_list, &firstfsb);
+ nimaps = 1;
+ error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
+ &firstfsb, 0, imap, &nimaps, &free_list);
+ if (error)
+ goto error0;
+
+ /*
+ * Complete the transaction
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto error0;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error_out;
+
+ /*
+ * Copy any maps to caller's array and return any error.
+ */
+ if (nimaps == 0) {
+ error = ENOSPC;
+ goto error_out;
+ }
+
+ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
+ error = xfs_alert_fsblock_zero(ip, imap);
+ goto error_out;
+ }
+
+ return 0;
+
+error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+
+error1: /* Just cancel transaction */
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+
+error_out:
+ return XFS_ERROR(error);
+}
+
+/*
+ * If the caller is doing a write at the end of the file, then extend the
+ * allocation out to the file system's write iosize. We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it it not needed.
+ */
+STATIC int
+xfs_iomap_eof_want_preallocate(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ size_t count,
+ xfs_bmbt_irec_t *imap,
+ int nimaps,
+ int *prealloc)
+{
+ xfs_fileoff_t start_fsb;
+ xfs_filblks_t count_fsb;
+ xfs_fsblock_t firstblock;
+ int n, error, imaps;
+ int found_delalloc = 0;
+
+ *prealloc = 0;
+ if ((offset + count) <= ip->i_size)
+ return 0;
+
+ /*
+ * If there are any real blocks past eof, then don't
+ * do any speculative allocation.
+ */
+ start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
+ count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+ while (count_fsb > 0) {
+ imaps = nimaps;
+ firstblock = NULLFSBLOCK;
+ error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
+ &firstblock, 0, imap, &imaps, NULL);
+ if (error)
+ return error;
+ for (n = 0; n < imaps; n++) {
+ if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
+ (imap[n].br_startblock != DELAYSTARTBLOCK))
+ return 0;
+ start_fsb += imap[n].br_blockcount;
+ count_fsb -= imap[n].br_blockcount;
+
+ if (imap[n].br_startblock == DELAYSTARTBLOCK)
+ found_delalloc = 1;
+ }
+ }
+ if (!found_delalloc)
+ *prealloc = 1;
+ return 0;
+}
+
+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum prealocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip)
+{
+ xfs_fsblock_t alloc_blocks = 0;
+
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+ int shift = 0;
+ int64_t freesp;
+
+ /*
+ * rounddown_pow_of_two() returns an undefined result
+ * if we pass in alloc_blocks = 0. Hence the "+ 1" to
+ * ensure we always pass in a non-zero value.
+ */
+ alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+ alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+ rounddown_pow_of_two(alloc_blocks));
+
+ xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ freesp = mp->m_sb.sb_fdblocks;
+ if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+ shift = 2;
+ if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+ shift++;
+ if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+ shift++;
+ }
+ if (shift)
+ alloc_blocks >>= shift;
+ }
+
+ if (alloc_blocks < mp->m_writeio_blocks)
+ alloc_blocks = mp->m_writeio_blocks;
+
+ return alloc_blocks;
+}
+
+int
+xfs_iomap_write_delay(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ size_t count,
+ xfs_bmbt_irec_t *ret_imap)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_off_t aligned_offset;
+ xfs_fileoff_t ioalign;
+ xfs_fsblock_t firstblock;
+ xfs_extlen_t extsz;
+ int nimaps;
+ xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
+ int prealloc, flushed = 0;
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ /*
+ * Make sure that the dquots are there. This doesn't hold
+ * the ilock across a disk read.
+ */
+ error = xfs_qm_dqattach_locked(ip, 0);
+ if (error)
+ return XFS_ERROR(error);
+
+ extsz = xfs_get_extsz_hint(ip);
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+
+ error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
+ imap, XFS_WRITE_IMAPS, &prealloc);
+ if (error)
+ return error;
+
+retry:
+ if (prealloc) {
+ xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+
+ aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
+ ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+ last_fsb = ioalign + alloc_blocks;
+ } else {
+ last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+ }
+
+ if (prealloc || extsz) {
+ error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
+ if (error)
+ return error;
+ }
+
+ nimaps = XFS_WRITE_IMAPS;
+ firstblock = NULLFSBLOCK;
+ error = xfs_bmapi(NULL, ip, offset_fsb,
+ (xfs_filblks_t)(last_fsb - offset_fsb),
+ XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
+ XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
+ &nimaps, NULL);
+ switch (error) {
+ case 0:
+ case ENOSPC:
+ case EDQUOT:
+ break;
+ default:
+ return XFS_ERROR(error);
+ }
+
+ /*
+ * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
+ * ENOSPC, * flush all other inodes with delalloc blocks to free up
+ * some of the excess reserved metadata space. For both cases, retry
+ * without EOF preallocation.
+ */
+ if (nimaps == 0) {
+ trace_xfs_delalloc_enospc(ip, offset, count);
+ if (flushed)
+ return XFS_ERROR(error ? error : ENOSPC);
+
+ if (error == ENOSPC) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_flush_inodes(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ }
+
+ flushed = 1;
+ error = 0;
+ prealloc = 0;
+ goto retry;
+ }
+
+ if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, &imap[0]);
+
+ *ret_imap = imap[0];
+ return 0;
+}
+
+/*
+ * Pass in a delayed allocate extent, convert it to real extents;
+ * return to the caller the extent we create which maps on top of
+ * the originating callers request.
+ *
+ * Called without a lock on the inode.
+ *
+ * We no longer bother to look at the incoming map - all we have to
+ * guarantee is that whatever we allocate fills the required range.
+ */
+int
+xfs_iomap_write_allocate(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ size_t count,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb, last_block;
+ xfs_fileoff_t end_fsb, map_start_fsb;
+ xfs_fsblock_t first_block;
+ xfs_bmap_free_t free_list;
+ xfs_filblks_t count_fsb;
+ xfs_trans_t *tp;
+ int nimaps, committed;
+ int error = 0;
+ int nres;
+
+ /*
+ * Make sure that the dquots are there.
+ */
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return XFS_ERROR(error);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ count_fsb = imap->br_blockcount;
+ map_start_fsb = imap->br_startoff;
+
+ XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+
+ while (count_fsb != 0) {
+ /*
+ * Set up a transaction with which to allocate the
+ * backing store for the file. Do allocations in a
+ * loop until we get some space in the range we are
+ * interested in. The other space that might be allocated
+ * is in the delayed allocation extent on which we sit
+ * but before our buffer starts.
+ */
+
+ nimaps = 0;
+ while (nimaps == 0) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_reserve(tp, nres,
+ XFS_WRITE_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return XFS_ERROR(error);
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * it is possible that the extents have changed since
+ * we did the read call as we dropped the ilock for a
+ * while. We have to be careful about truncates or hole
+ * punchs here - we are not allowed to allocate
+ * non-delalloc blocks here.
+ *
+ * The only protection against truncation is the pages
+ * for the range we are being asked to convert are
+ * locked and hence a truncate will block on them
+ * first.
+ *
+ * As a result, if we go beyond the range we really
+ * need and hit an delalloc extent boundary followed by
+ * a hole while we have excess blocks in the map, we
+ * will fill the hole incorrectly and overrun the
+ * transaction reservation.
+ *
+ * Using a single map prevents this as we are forced to
+ * check each map we look for overlap with the desired
+ * range and abort as soon as we find it. Also, given
+ * that we only return a single map, having one beyond
+ * what we can return is probably a bit silly.
+ *
+ * We also need to check that we don't go beyond EOF;
+ * this is a truncate optimisation as a truncate sets
+ * the new file size before block on the pages we
+ * currently have locked under writeback. Because they
+ * are about to be tossed, we don't need to write them
+ * back....
+ */
+ nimaps = 1;
+ end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+ error = xfs_bmap_last_offset(NULL, ip, &last_block,
+ XFS_DATA_FORK);
+ if (error)
+ goto trans_cancel;
+
+ last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
+ if ((map_start_fsb + count_fsb) > last_block) {
+ count_fsb = last_block - map_start_fsb;
+ if (count_fsb == 0) {
+ error = EAGAIN;
+ goto trans_cancel;
+ }
+ }
+
+ /*
+ * Go get the actual blocks.
+ *
+ * From this point onwards we overwrite the imap
+ * pointer that the caller gave to us.
+ */
+ error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
+ XFS_BMAPI_WRITE, &first_block, 1,
+ imap, &nimaps, &free_list);
+ if (error)
+ goto trans_cancel;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto trans_cancel;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error0;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ /*
+ * See if we were able to allocate an extent that
+ * covers at least part of the callers request
+ */
+ if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
+ if ((offset_fsb >= imap->br_startoff) &&
+ (offset_fsb < (imap->br_startoff +
+ imap->br_blockcount))) {
+ XFS_STATS_INC(xs_xstrat_quick);
+ return 0;
+ }
+
+ /*
+ * So far we have not mapped the requested part of the
+ * file, just surrounding data, try again.
+ */
+ count_fsb -= imap->br_blockcount;
+ map_start_fsb = imap->br_startoff + imap->br_blockcount;
+ }
+
+trans_cancel:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error0:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return XFS_ERROR(error);
+}
+
+int
+xfs_iomap_write_unwritten(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ size_t count)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb;
+ xfs_filblks_t count_fsb;
+ xfs_filblks_t numblks_fsb;
+ xfs_fsblock_t firstfsb;
+ int nimaps;
+ xfs_trans_t *tp;
+ xfs_bmbt_irec_t imap;
+ xfs_bmap_free_t free_list;
+ uint resblks;
+ int committed;
+ int error;
+
+ trace_xfs_unwritten_convert(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+ count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
+
+ /*
+ * Reserve enough blocks in this transaction for two complete extent
+ * btree splits. We may be converting the middle part of an unwritten
+ * extent and in this case we will insert two new extents in the btree
+ * each of which could cause a full split.
+ *
+ * This reservation amount will be used in the first call to
+ * xfs_bmbt_split() to select an AG with enough space to satisfy the
+ * rest of the operation.
+ */
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+
+ do {
+ /*
+ * set up a transaction to convert the range of extents
+ * from unwritten to real. Do allocations in a loop until
+ * we have covered the range passed in.
+ *
+ * Note that we open code the transaction allocation here
+ * to pass KM_NOFS--we can't risk to recursing back into
+ * the filesystem here as we might be asked to write out
+ * the same inode that we complete here and might deadlock
+ * on the iolock.
+ */
+ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, resblks,
+ XFS_WRITE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return XFS_ERROR(error);
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+
+ /*
+ * Modify the unwritten extent state of the buffer.
+ */
+ xfs_bmap_init(&free_list, &firstfsb);
+ nimaps = 1;
+ error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
+ 1, &imap, &nimaps, &free_list);
+ if (error)
+ goto error_on_bmapi_transaction;
+
+ error = xfs_bmap_finish(&(tp), &(free_list), &committed);
+ if (error)
+ goto error_on_bmapi_transaction;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return XFS_ERROR(error);
+
+ if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, &imap);
+
+ if ((numblks_fsb = imap.br_blockcount) == 0) {
+ /*
+ * The numblks_fsb value should always get
+ * smaller, otherwise the loop is stuck.
+ */
+ ASSERT(imap.br_blockcount);
+ break;
+ }
+ offset_fsb += numblks_fsb;
+ count_fsb -= numblks_fsb;
+ } while (count_fsb > 0);
+
+ return 0;
+
+error_on_bmapi_transaction:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return XFS_ERROR(error);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
new file mode 100644
index 00000000..80615760
--- /dev/null
+++ b/fs/xfs/xfs_iomap.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2003-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_IOMAP_H__
+#define __XFS_IOMAP_H__
+
+struct xfs_inode;
+struct xfs_bmbt_irec;
+
+extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+ struct xfs_bmbt_irec *, int);
+extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+ struct xfs_bmbt_irec *);
+extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+ struct xfs_bmbt_irec *);
+extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
+
+#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
new file mode 100644
index 00000000..751e94fe
--- /dev/null
+++ b/fs/xfs/xfs_itable.c
@@ -0,0 +1,736 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_internal_inum(
+ xfs_mount_t *mp,
+ xfs_ino_t ino)
+{
+ return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+ (xfs_sb_version_hasquota(&mp->m_sb) &&
+ (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+}
+
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int
+xfs_bulkstat_one_int(
+ struct xfs_mount *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
+ int *ubused, /* bytes used by me */
+ int *stat) /* BULKSTAT_RV_... */
+{
+ struct xfs_icdinode *dic; /* dinode core info pointer */
+ struct xfs_inode *ip; /* incore inode pointer */
+ struct inode *inode;
+ struct xfs_bstat *buf; /* return buffer */
+ int error = 0; /* error value */
+
+ *stat = BULKSTAT_RV_NOTHING;
+
+ if (!buffer || xfs_internal_inum(mp, ino))
+ return XFS_ERROR(EINVAL);
+
+ buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+ if (!buf)
+ return XFS_ERROR(ENOMEM);
+
+ error = xfs_iget(mp, NULL, ino,
+ XFS_IGET_UNTRUSTED, XFS_ILOCK_SHARED, &ip);
+ if (error) {
+ *stat = BULKSTAT_RV_NOTHING;
+ goto out_free;
+ }
+
+ ASSERT(ip != NULL);
+ ASSERT(ip->i_imap.im_blkno != 0);
+
+ dic = &ip->i_d;
+ inode = VFS_I(ip);
+
+ /* xfs_iget returns the following without needing
+ * further change.
+ */
+ buf->bs_nlink = dic->di_nlink;
+ buf->bs_projid_lo = dic->di_projid_lo;
+ buf->bs_projid_hi = dic->di_projid_hi;
+ buf->bs_ino = ino;
+ buf->bs_mode = dic->di_mode;
+ buf->bs_uid = dic->di_uid;
+ buf->bs_gid = dic->di_gid;
+ buf->bs_size = dic->di_size;
+
+ /*
+ * We need to read the timestamps from the Linux inode because
+ * the VFS keeps writing directly into the inode structure instead
+ * of telling us about the updates.
+ */
+ buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+ buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+ buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+ buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+
+ buf->bs_xflags = xfs_ip2xflags(ip);
+ buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
+ buf->bs_extents = dic->di_nextents;
+ buf->bs_gen = dic->di_gen;
+ memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
+ buf->bs_dmevmask = dic->di_dmevmask;
+ buf->bs_dmstate = dic->di_dmstate;
+ buf->bs_aextents = dic->di_anextents;
+ buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+
+ switch (dic->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ buf->bs_rdev = ip->i_df.if_u2.if_rdev;
+ buf->bs_blksize = BLKDEV_IOSIZE;
+ buf->bs_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ buf->bs_rdev = 0;
+ buf->bs_blksize = mp->m_sb.sb_blocksize;
+ buf->bs_blocks = 0;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ buf->bs_rdev = 0;
+ buf->bs_blksize = mp->m_sb.sb_blocksize;
+ buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
+ break;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ IRELE(ip);
+
+ error = formatter(buffer, ubsize, ubused, buf);
+
+ if (!error)
+ *stat = BULKSTAT_RV_DIDONE;
+
+ out_free:
+ kmem_free(buf);
+ return error;
+}
+
+/* Return 0 on success or positive error */
+STATIC int
+xfs_bulkstat_one_fmt(
+ void __user *ubuffer,
+ int ubsize,
+ int *ubused,
+ const xfs_bstat_t *buffer)
+{
+ if (ubsize < sizeof(*buffer))
+ return XFS_ERROR(ENOMEM);
+ if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
+ return XFS_ERROR(EFAULT);
+ if (ubused)
+ *ubused = sizeof(*buffer);
+ return 0;
+}
+
+int
+xfs_bulkstat_one(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ int *ubused, /* bytes used by me */
+ int *stat) /* BULKSTAT_RV_... */
+{
+ return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+ xfs_bulkstat_one_fmt, ubused, stat);
+}
+
+#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int /* error status */
+xfs_bulkstat(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t *lastinop, /* last inode returned */
+ int *ubcountp, /* size of buffer/count returned */
+ bulkstat_one_pf formatter, /* func that'd fill a single buf */
+ size_t statstruct_size, /* sizeof struct filling */
+ char __user *ubuffer, /* buffer with inode stats */
+ int *done) /* 1 if there are more stats to get */
+{
+ xfs_agblock_t agbno=0;/* allocation group block number */
+ xfs_buf_t *agbp; /* agi header buffer */
+ xfs_agi_t *agi; /* agi header data */
+ xfs_agino_t agino; /* inode # in allocation group */
+ xfs_agnumber_t agno; /* allocation group number */
+ int chunkidx; /* current index into inode chunk */
+ int clustidx; /* current index into inode cluster */
+ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
+ int end_of_ag; /* set if we've seen the ag end */
+ int error; /* error code */
+ int fmterror;/* bulkstat formatter result */
+ int i; /* loop index */
+ int icount; /* count of inodes good in irbuf */
+ size_t irbsize; /* size of irec buffer in bytes */
+ xfs_ino_t ino; /* inode number (filesystem) */
+ xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */
+ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
+ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
+ xfs_ino_t lastino; /* last inode number returned */
+ int nbcluster; /* # of blocks in a cluster */
+ int nicluster; /* # of inodes in a cluster */
+ int nimask; /* mask for inode clusters */
+ int nirbuf; /* size of irbuf */
+ int rval; /* return value error code */
+ int tmp; /* result value from btree calls */
+ int ubcount; /* size of user's buffer */
+ int ubleft; /* bytes left in user's buffer */
+ char __user *ubufp; /* pointer into user's buffer */
+ int ubelem; /* spaces used in user's buffer */
+ int ubused; /* bytes used by formatter */
+ xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
+
+ /*
+ * Get the last inode value, see if there's nothing to do.
+ */
+ ino = (xfs_ino_t)*lastinop;
+ lastino = ino;
+ agno = XFS_INO_TO_AGNO(mp, ino);
+ agino = XFS_INO_TO_AGINO(mp, ino);
+ if (agno >= mp->m_sb.sb_agcount ||
+ ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ *done = 1;
+ *ubcountp = 0;
+ return 0;
+ }
+ if (!ubcountp || *ubcountp <= 0) {
+ return EINVAL;
+ }
+ ubcount = *ubcountp; /* statstruct's */
+ ubleft = ubcount * statstruct_size; /* bytes */
+ *ubcountp = ubelem = 0;
+ *done = 0;
+ fmterror = 0;
+ ubufp = ubuffer;
+ nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
+ mp->m_sb.sb_inopblock :
+ (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
+ nimask = ~(nicluster - 1);
+ nbcluster = nicluster >> mp->m_sb.sb_inopblog;
+ irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
+ if (!irbuf)
+ return ENOMEM;
+
+ nirbuf = irbsize / sizeof(*irbuf);
+
+ /*
+ * Loop over the allocation groups, starting from the last
+ * inode returned; 0 means start of the allocation group.
+ */
+ rval = 0;
+ while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
+ cond_resched();
+ bp = NULL;
+ error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+ if (error) {
+ /*
+ * Skip this allocation group and go to the next one.
+ */
+ agno++;
+ agino = 0;
+ continue;
+ }
+ agi = XFS_BUF_TO_AGI(agbp);
+ /*
+ * Allocate and initialize a btree cursor for ialloc btree.
+ */
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ irbp = irbuf;
+ irbufend = irbuf + nirbuf;
+ end_of_ag = 0;
+ /*
+ * If we're returning in the middle of an allocation group,
+ * we need to get the remainder of the chunk we're in.
+ */
+ if (agino > 0) {
+ xfs_inobt_rec_incore_t r;
+
+ /*
+ * Lookup the inode chunk that this inode lives in.
+ */
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
+ &tmp);
+ if (!error && /* no I/O error */
+ tmp && /* lookup succeeded */
+ /* got the record, should always work */
+ !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
+ i == 1 &&
+ /* this is the right chunk */
+ agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
+ /* lastino was not last in chunk */
+ (chunkidx = agino - r.ir_startino + 1) <
+ XFS_INODES_PER_CHUNK &&
+ /* there are some left allocated */
+ xfs_inobt_maskn(chunkidx,
+ XFS_INODES_PER_CHUNK - chunkidx) &
+ ~r.ir_free) {
+ /*
+ * Grab the chunk record. Mark all the
+ * uninteresting inodes (because they're
+ * before our start point) free.
+ */
+ for (i = 0; i < chunkidx; i++) {
+ if (XFS_INOBT_MASK(i) & ~r.ir_free)
+ r.ir_freecount++;
+ }
+ r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
+ irbp++;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+ icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
+ } else {
+ /*
+ * If any of those tests failed, bump the
+ * inode number (just in case).
+ */
+ agino++;
+ icount = 0;
+ }
+ /*
+ * In any case, increment to the next record.
+ */
+ if (!error)
+ error = xfs_btree_increment(cur, 0, &tmp);
+ } else {
+ /*
+ * Start of ag. Lookup the first inode chunk.
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
+ icount = 0;
+ }
+ /*
+ * Loop through inode btree records in this ag,
+ * until we run out of inodes or space in the buffer.
+ */
+ while (irbp < irbufend && icount < ubcount) {
+ xfs_inobt_rec_incore_t r;
+
+ /*
+ * Loop as long as we're unable to read the
+ * inode btree.
+ */
+ while (error) {
+ agino += XFS_INODES_PER_CHUNK;
+ if (XFS_AGINO_TO_AGBNO(mp, agino) >=
+ be32_to_cpu(agi->agi_length))
+ break;
+ error = xfs_inobt_lookup(cur, agino,
+ XFS_LOOKUP_GE, &tmp);
+ cond_resched();
+ }
+ /*
+ * If ran off the end of the ag either with an error,
+ * or the normal way, set end and stop collecting.
+ */
+ if (error) {
+ end_of_ag = 1;
+ break;
+ }
+
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
+ end_of_ag = 1;
+ break;
+ }
+
+ /*
+ * If this chunk has any allocated inodes, save it.
+ * Also start read-ahead now for this chunk.
+ */
+ if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ /*
+ * Loop over all clusters in the next chunk.
+ * Do a readahead if there are any allocated
+ * inodes in that cluster.
+ */
+ agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+ for (chunkidx = 0;
+ chunkidx < XFS_INODES_PER_CHUNK;
+ chunkidx += nicluster,
+ agbno += nbcluster) {
+ if (xfs_inobt_maskn(chunkidx, nicluster)
+ & ~r.ir_free)
+ xfs_btree_reada_bufs(mp, agno,
+ agbno, nbcluster);
+ }
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
+ irbp++;
+ icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+ }
+ /*
+ * Set agino to after this chunk and bump the cursor.
+ */
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+ error = xfs_btree_increment(cur, 0, &tmp);
+ cond_resched();
+ }
+ /*
+ * Drop the btree buffers and the agi buffer.
+ * We can't hold any of the locks these represent
+ * when calling iget.
+ */
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_buf_relse(agbp);
+ /*
+ * Now format all the good inodes into the user's buffer.
+ */
+ irbufend = irbp;
+ for (irbp = irbuf;
+ irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
+ /*
+ * Now process this chunk of inodes.
+ */
+ for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
+ XFS_BULKSTAT_UBLEFT(ubleft) &&
+ irbp->ir_freecount < XFS_INODES_PER_CHUNK;
+ chunkidx++, clustidx++, agino++) {
+ ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+ /*
+ * Recompute agbno if this is the
+ * first inode of the cluster.
+ *
+ * Careful with clustidx. There can be
+ * multiple clusters per chunk, a single
+ * cluster per chunk or a cluster that has
+ * inodes represented from several different
+ * chunks (if blocksize is large).
+ *
+ * Because of this, the starting clustidx is
+ * initialized to zero in this loop but must
+ * later be reset after reading in the cluster
+ * buffer.
+ */
+ if ((chunkidx & (nicluster - 1)) == 0) {
+ agbno = XFS_AGINO_TO_AGBNO(mp,
+ irbp->ir_startino) +
+ ((chunkidx & nimask) >>
+ mp->m_sb.sb_inopblog);
+ }
+ ino = XFS_AGINO_TO_INO(mp, agno, agino);
+ /*
+ * Skip if this inode is free.
+ */
+ if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
+ lastino = ino;
+ continue;
+ }
+ /*
+ * Count used inodes as free so we can tell
+ * when the chunk is used up.
+ */
+ irbp->ir_freecount++;
+
+ /*
+ * Get the inode and fill in a single buffer.
+ */
+ ubused = statstruct_size;
+ error = formatter(mp, ino, ubufp, ubleft,
+ &ubused, &fmterror);
+ if (fmterror == BULKSTAT_RV_NOTHING) {
+ if (error && error != ENOENT &&
+ error != EINVAL) {
+ ubleft = 0;
+ rval = error;
+ break;
+ }
+ lastino = ino;
+ continue;
+ }
+ if (fmterror == BULKSTAT_RV_GIVEUP) {
+ ubleft = 0;
+ ASSERT(error);
+ rval = error;
+ break;
+ }
+ if (ubufp)
+ ubufp += ubused;
+ ubleft -= ubused;
+ ubelem++;
+ lastino = ino;
+ }
+
+ cond_resched();
+ }
+
+ if (bp)
+ xfs_buf_relse(bp);
+
+ /*
+ * Set up for the next loop iteration.
+ */
+ if (XFS_BULKSTAT_UBLEFT(ubleft)) {
+ if (end_of_ag) {
+ agno++;
+ agino = 0;
+ } else
+ agino = XFS_INO_TO_AGINO(mp, lastino);
+ } else
+ break;
+ }
+ /*
+ * Done, we're either out of filesystem or space to put the data.
+ */
+ kmem_free_large(irbuf);
+ *ubcountp = ubelem;
+ /*
+ * Found some inodes, return them now and return the error next time.
+ */
+ if (ubelem)
+ rval = 0;
+ if (agno >= mp->m_sb.sb_agcount) {
+ /*
+ * If we ran out of filesystem, mark lastino as off
+ * the end of the filesystem, so the next call
+ * will return immediately.
+ */
+ *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
+ *done = 1;
+ } else
+ *lastinop = (xfs_ino_t)lastino;
+
+ return rval;
+}
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ * Special case for non-sequential one inode bulkstat.
+ */
+int /* error status */
+xfs_bulkstat_single(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t *lastinop, /* inode to return */
+ char __user *buffer, /* buffer with inode stats */
+ int *done) /* 1 if there are more stats to get */
+{
+ int count; /* count value for bulkstat call */
+ int error; /* return value */
+ xfs_ino_t ino; /* filesystem inode number */
+ int res; /* result from bs1 */
+
+ /*
+ * note that requesting valid inode numbers which are not allocated
+ * to inodes will most likely cause xfs_itobp to generate warning
+ * messages about bad magic numbers. This is ok. The fact that
+ * the inode isn't actually an inode is handled by the
+ * error check below. Done this way to make the usual case faster
+ * at the expense of the error case.
+ */
+
+ ino = (xfs_ino_t)*lastinop;
+ error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
+ if (error) {
+ /*
+ * Special case way failed, do it the "long" way
+ * to see if that works.
+ */
+ (*lastinop)--;
+ count = 1;
+ if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
+ sizeof(xfs_bstat_t), buffer, done))
+ return error;
+ if (count == 0 || (xfs_ino_t)*lastinop != ino)
+ return error == EFSCORRUPTED ?
+ XFS_ERROR(EINVAL) : error;
+ else
+ return 0;
+ }
+ *done = 0;
+ return 0;
+}
+
+int
+xfs_inumbers_fmt(
+ void __user *ubuffer, /* buffer to write to */
+ const xfs_inogrp_t *buffer, /* buffer to read from */
+ long count, /* # of elements to read */
+ long *written) /* # of bytes written */
+{
+ if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer)))
+ return -EFAULT;
+ *written = count * sizeof(*buffer);
+ return 0;
+}
+
+/*
+ * Return inode number table for the filesystem.
+ */
+int /* error status */
+xfs_inumbers(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t *lastino, /* last inode returned */
+ int *count, /* size of buffer/count returned */
+ void __user *ubuffer,/* buffer with inode descriptions */
+ inumbers_fmt_pf formatter)
+{
+ xfs_buf_t *agbp;
+ xfs_agino_t agino;
+ xfs_agnumber_t agno;
+ int bcount;
+ xfs_inogrp_t *buffer;
+ int bufidx;
+ xfs_btree_cur_t *cur;
+ int error;
+ xfs_inobt_rec_incore_t r;
+ int i;
+ xfs_ino_t ino;
+ int left;
+ int tmp;
+
+ ino = (xfs_ino_t)*lastino;
+ agno = XFS_INO_TO_AGNO(mp, ino);
+ agino = XFS_INO_TO_AGINO(mp, ino);
+ left = *count;
+ *count = 0;
+ bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
+ buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
+ error = bufidx = 0;
+ cur = NULL;
+ agbp = NULL;
+ while (left > 0 && agno < mp->m_sb.sb_agcount) {
+ if (agbp == NULL) {
+ error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+ if (error) {
+ /*
+ * If we can't read the AGI of this ag,
+ * then just skip to the next one.
+ */
+ ASSERT(cur == NULL);
+ agbp = NULL;
+ agno++;
+ agino = 0;
+ continue;
+ }
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+ &tmp);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ cur = NULL;
+ xfs_buf_relse(agbp);
+ agbp = NULL;
+ /*
+ * Move up the last inode in the current
+ * chunk. The lookup_ge will always get
+ * us the first inode in the next chunk.
+ */
+ agino += XFS_INODES_PER_CHUNK - 1;
+ continue;
+ }
+ }
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
+ xfs_buf_relse(agbp);
+ agbp = NULL;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = NULL;
+ agno++;
+ agino = 0;
+ continue;
+ }
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+ buffer[bufidx].xi_startino =
+ XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+ buffer[bufidx].xi_alloccount =
+ XFS_INODES_PER_CHUNK - r.ir_freecount;
+ buffer[bufidx].xi_allocmask = ~r.ir_free;
+ bufidx++;
+ left--;
+ if (bufidx == bcount) {
+ long written;
+ if (formatter(ubuffer, buffer, bufidx, &written)) {
+ error = XFS_ERROR(EFAULT);
+ break;
+ }
+ ubuffer += written;
+ *count += bufidx;
+ bufidx = 0;
+ }
+ if (left) {
+ error = xfs_btree_increment(cur, 0, &tmp);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ cur = NULL;
+ xfs_buf_relse(agbp);
+ agbp = NULL;
+ /*
+ * The agino value has already been bumped.
+ * Just try to skip up to it.
+ */
+ agino += XFS_INODES_PER_CHUNK;
+ continue;
+ }
+ }
+ }
+ if (!error) {
+ if (bufidx) {
+ long written;
+ if (formatter(ubuffer, buffer, bufidx, &written))
+ error = XFS_ERROR(EFAULT);
+ else
+ *count += bufidx;
+ }
+ *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
+ }
+ kmem_free(buffer);
+ if (cur)
+ xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR));
+ if (agbp)
+ xfs_buf_relse(agbp);
+ return error;
+}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
new file mode 100644
index 00000000..97295d91
--- /dev/null
+++ b/fs/xfs/xfs_itable.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ITABLE_H__
+#define __XFS_ITABLE_H__
+
+/*
+ * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
+ * structures (by the dmi library). This is a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c
+ */
+typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
+ xfs_ino_t ino,
+ void __user *buffer,
+ int ubsize,
+ int *ubused,
+ int *stat);
+
+/*
+ * Values for stat return value.
+ */
+#define BULKSTAT_RV_NOTHING 0
+#define BULKSTAT_RV_DIDONE 1
+#define BULKSTAT_RV_GIVEUP 2
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int /* error status */
+xfs_bulkstat(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t *lastino, /* last inode returned */
+ int *count, /* size of buffer/count returned */
+ bulkstat_one_pf formatter, /* func that'd fill a single buf */
+ size_t statstruct_size,/* sizeof struct that we're filling */
+ char __user *ubuffer,/* buffer with inode stats */
+ int *done); /* 1 if there are more stats to get */
+
+int
+xfs_bulkstat_single(
+ xfs_mount_t *mp,
+ xfs_ino_t *lastinop,
+ char __user *buffer,
+ int *done);
+
+typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
+ void __user *ubuffer, /* buffer to write to */
+ int ubsize, /* remaining user buffer sz */
+ int *ubused, /* bytes used by formatter */
+ const xfs_bstat_t *buffer); /* buffer to read from */
+
+int
+xfs_bulkstat_one_int(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ void __user *buffer,
+ int ubsize,
+ bulkstat_one_fmt_pf formatter,
+ int *ubused,
+ int *stat);
+
+int
+xfs_bulkstat_one(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ void __user *buffer,
+ int ubsize,
+ int *ubused,
+ int *stat);
+
+typedef int (*inumbers_fmt_pf)(
+ void __user *ubuffer, /* buffer to write to */
+ const xfs_inogrp_t *buffer, /* buffer to read from */
+ long count, /* # of elements to read */
+ long *written); /* # of bytes written */
+
+int
+xfs_inumbers_fmt(
+ void __user *ubuffer, /* buffer to write to */
+ const xfs_inogrp_t *buffer, /* buffer to read from */
+ long count, /* # of elements to read */
+ long *written); /* # of bytes written */
+
+int /* error status */
+xfs_inumbers(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t *last, /* last inode returned */
+ int *count, /* size of buffer/count returned */
+ void __user *buffer, /* buffer with inode info */
+ inumbers_fmt_pf formatter);
+
+#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
new file mode 100644
index 00000000..41d5b8f2
--- /dev/null
+++ b/fs/xfs/xfs_log.c
@@ -0,0 +1,3767 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_log_recover.h"
+#include "xfs_trans_priv.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_rw.h"
+#include "xfs_trace.h"
+
+kmem_zone_t *xfs_log_ticket_zone;
+
+/* Local miscellaneous function prototypes */
+STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
+ xlog_in_core_t **, xfs_lsn_t *);
+STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
+ xfs_buftarg_t *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks);
+STATIC int xlog_space_left(struct log *log, atomic64_t *head);
+STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
+STATIC void xlog_dealloc_log(xlog_t *log);
+
+/* local state machine functions */
+STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
+STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog);
+STATIC int xlog_state_get_iclog_space(xlog_t *log,
+ int len,
+ xlog_in_core_t **iclog,
+ xlog_ticket_t *ticket,
+ int *continued_write,
+ int *logoffsetp);
+STATIC int xlog_state_release_iclog(xlog_t *log,
+ xlog_in_core_t *iclog);
+STATIC void xlog_state_switch_iclogs(xlog_t *log,
+ xlog_in_core_t *iclog,
+ int eventual_size);
+STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
+
+/* local functions to manipulate grant head */
+STATIC int xlog_grant_log_space(xlog_t *log,
+ xlog_ticket_t *xtic);
+STATIC void xlog_grant_push_ail(struct log *log,
+ int need_bytes);
+STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
+ xlog_ticket_t *ticket);
+STATIC int xlog_regrant_write_log_space(xlog_t *log,
+ xlog_ticket_t *ticket);
+STATIC void xlog_ungrant_log_space(xlog_t *log,
+ xlog_ticket_t *ticket);
+
+#if defined(DEBUG)
+STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
+STATIC void xlog_verify_grant_tail(struct log *log);
+STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
+ int count, boolean_t syncing);
+STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
+ xfs_lsn_t tail_lsn);
+#else
+#define xlog_verify_dest_ptr(a,b)
+#define xlog_verify_grant_tail(a)
+#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_tail_lsn(a,b,c)
+#endif
+
+STATIC int xlog_iclogs_empty(xlog_t *log);
+
+static void
+xlog_grant_sub_space(
+ struct log *log,
+ atomic64_t *head,
+ int bytes)
+{
+ int64_t head_val = atomic64_read(head);
+ int64_t new, old;
+
+ do {
+ int cycle, space;
+
+ xlog_crack_grant_head_val(head_val, &cycle, &space);
+
+ space -= bytes;
+ if (space < 0) {
+ space += log->l_logsize;
+ cycle--;
+ }
+
+ old = head_val;
+ new = xlog_assign_grant_head_val(cycle, space);
+ head_val = atomic64_cmpxchg(head, old, new);
+ } while (head_val != old);
+}
+
+static void
+xlog_grant_add_space(
+ struct log *log,
+ atomic64_t *head,
+ int bytes)
+{
+ int64_t head_val = atomic64_read(head);
+ int64_t new, old;
+
+ do {
+ int tmp;
+ int cycle, space;
+
+ xlog_crack_grant_head_val(head_val, &cycle, &space);
+
+ tmp = log->l_logsize - space;
+ if (tmp > bytes)
+ space += bytes;
+ else {
+ space = bytes - tmp;
+ cycle++;
+ }
+
+ old = head_val;
+ new = xlog_assign_grant_head_val(cycle, space);
+ head_val = atomic64_cmpxchg(head, old, new);
+ } while (head_val != old);
+}
+
+static void
+xlog_tic_reset_res(xlog_ticket_t *tic)
+{
+ tic->t_res_num = 0;
+ tic->t_res_arr_sum = 0;
+ tic->t_res_num_ophdrs = 0;
+}
+
+static void
+xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
+{
+ if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
+ /* add to overflow and start again */
+ tic->t_res_o_flow += tic->t_res_arr_sum;
+ tic->t_res_num = 0;
+ tic->t_res_arr_sum = 0;
+ }
+
+ tic->t_res_arr[tic->t_res_num].r_len = len;
+ tic->t_res_arr[tic->t_res_num].r_type = type;
+ tic->t_res_arr_sum += len;
+ tic->t_res_num++;
+}
+
+/*
+ * NOTES:
+ *
+ * 1. currblock field gets updated at startup and after in-core logs
+ * marked as with WANT_SYNC.
+ */
+
+/*
+ * This routine is called when a user of a log manager ticket is done with
+ * the reservation. If the ticket was ever used, then a commit record for
+ * the associated transaction is written out as a log operation header with
+ * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
+ * a given ticket. If the ticket was one with a permanent reservation, then
+ * a few operations are done differently. Permanent reservation tickets by
+ * default don't release the reservation. They just commit the current
+ * transaction with the belief that the reservation is still needed. A flag
+ * must be passed in before permanent reservations are actually released.
+ * When these type of tickets are not released, they need to be set into
+ * the inited state again. By doing this, a start record will be written
+ * out when the next write occurs.
+ */
+xfs_lsn_t
+xfs_log_done(
+ struct xfs_mount *mp,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog,
+ uint flags)
+{
+ struct log *log = mp->m_log;
+ xfs_lsn_t lsn = 0;
+
+ if (XLOG_FORCED_SHUTDOWN(log) ||
+ /*
+ * If nothing was ever written, don't write out commit record.
+ * If we get an error, just continue and give back the log ticket.
+ */
+ (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
+ (xlog_commit_record(log, ticket, iclog, &lsn)))) {
+ lsn = (xfs_lsn_t) -1;
+ if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
+ flags |= XFS_LOG_REL_PERM_RESERV;
+ }
+ }
+
+
+ if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
+ (flags & XFS_LOG_REL_PERM_RESERV)) {
+ trace_xfs_log_done_nonperm(log, ticket);
+
+ /*
+ * Release ticket if not permanent reservation or a specific
+ * request has been made to release a permanent reservation.
+ */
+ xlog_ungrant_log_space(log, ticket);
+ xfs_log_ticket_put(ticket);
+ } else {
+ trace_xfs_log_done_perm(log, ticket);
+
+ xlog_regrant_reserve_log_space(log, ticket);
+ /* If this ticket was a permanent reservation and we aren't
+ * trying to release it, reset the inited flags; so next time
+ * we write, a start record will be written out.
+ */
+ ticket->t_flags |= XLOG_TIC_INITED;
+ }
+
+ return lsn;
+}
+
+/*
+ * Attaches a new iclog I/O completion callback routine during
+ * transaction commit. If the log is in error state, a non-zero
+ * return code is handed back and the caller is responsible for
+ * executing the callback at an appropriate time.
+ */
+int
+xfs_log_notify(
+ struct xfs_mount *mp,
+ struct xlog_in_core *iclog,
+ xfs_log_callback_t *cb)
+{
+ int abortflg;
+
+ spin_lock(&iclog->ic_callback_lock);
+ abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
+ if (!abortflg) {
+ ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
+ (iclog->ic_state == XLOG_STATE_WANT_SYNC));
+ cb->cb_next = NULL;
+ *(iclog->ic_callback_tail) = cb;
+ iclog->ic_callback_tail = &(cb->cb_next);
+ }
+ spin_unlock(&iclog->ic_callback_lock);
+ return abortflg;
+}
+
+int
+xfs_log_release_iclog(
+ struct xfs_mount *mp,
+ struct xlog_in_core *iclog)
+{
+ if (xlog_state_release_iclog(mp->m_log, iclog)) {
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ return EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * 1. Reserve an amount of on-disk log space and return a ticket corresponding
+ * to the reservation.
+ * 2. Potentially, push buffers at tail of log to disk.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation. By wasting space in each
+ * reservation, we prevent over allocation problems.
+ */
+int
+xfs_log_reserve(
+ struct xfs_mount *mp,
+ int unit_bytes,
+ int cnt,
+ struct xlog_ticket **ticket,
+ __uint8_t client,
+ uint flags,
+ uint t_type)
+{
+ struct log *log = mp->m_log;
+ struct xlog_ticket *internal_ticket;
+ int retval = 0;
+
+ ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return XFS_ERROR(EIO);
+
+ XFS_STATS_INC(xs_try_logspace);
+
+
+ if (*ticket != NULL) {
+ ASSERT(flags & XFS_LOG_PERM_RESERV);
+ internal_ticket = *ticket;
+
+ /*
+ * this is a new transaction on the ticket, so we need to
+ * change the transaction ID so that the next transaction has a
+ * different TID in the log. Just add one to the existing tid
+ * so that we can see chains of rolling transactions in the log
+ * easily.
+ */
+ internal_ticket->t_tid++;
+
+ trace_xfs_log_reserve(log, internal_ticket);
+
+ xlog_grant_push_ail(log, internal_ticket->t_unit_res);
+ retval = xlog_regrant_write_log_space(log, internal_ticket);
+ } else {
+ /* may sleep if need to allocate more tickets */
+ internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
+ client, flags,
+ KM_SLEEP|KM_MAYFAIL);
+ if (!internal_ticket)
+ return XFS_ERROR(ENOMEM);
+ internal_ticket->t_trans_type = t_type;
+ *ticket = internal_ticket;
+
+ trace_xfs_log_reserve(log, internal_ticket);
+
+ xlog_grant_push_ail(log,
+ (internal_ticket->t_unit_res *
+ internal_ticket->t_cnt));
+ retval = xlog_grant_log_space(log, internal_ticket);
+ }
+
+ return retval;
+} /* xfs_log_reserve */
+
+
+/*
+ * Mount a log filesystem
+ *
+ * mp - ubiquitous xfs mount point structure
+ * log_target - buftarg of on-disk log device
+ * blk_offset - Start block # where block size is 512 bytes (BBSIZE)
+ * num_bblocks - Number of BBSIZE blocks in on-disk log
+ *
+ * Return error or zero.
+ */
+int
+xfs_log_mount(
+ xfs_mount_t *mp,
+ xfs_buftarg_t *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
+{
+ int error;
+
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+ xfs_notice(mp, "Mounting Filesystem");
+ else {
+ xfs_notice(mp,
+"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
+ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ }
+
+ mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+ if (IS_ERR(mp->m_log)) {
+ error = -PTR_ERR(mp->m_log);
+ goto out;
+ }
+
+ /*
+ * Initialize the AIL now we have a log.
+ */
+ error = xfs_trans_ail_init(mp);
+ if (error) {
+ xfs_warn(mp, "AIL initialisation failed: error %d", error);
+ goto out_free_log;
+ }
+ mp->m_log->l_ailp = mp->m_ail;
+
+ /*
+ * skip log recovery on a norecovery mount. pretend it all
+ * just worked.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+ int readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+ if (readonly)
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
+
+ error = xlog_recover(mp->m_log);
+
+ if (readonly)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ if (error) {
+ xfs_warn(mp, "log mount/recovery failed: error %d",
+ error);
+ goto out_destroy_ail;
+ }
+ }
+
+ /* Normal transactions can now occur */
+ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+
+ /*
+ * Now the log has been fully initialised and we know were our
+ * space grant counters are, we can initialise the permanent ticket
+ * needed for delayed logging to work.
+ */
+ xlog_cil_init_post_recovery(mp->m_log);
+
+ return 0;
+
+out_destroy_ail:
+ xfs_trans_ail_destroy(mp);
+out_free_log:
+ xlog_dealloc_log(mp->m_log);
+out:
+ return error;
+}
+
+/*
+ * Finish the recovery of the file system. This is separate from
+ * the xfs_log_mount() call, because it depends on the code in
+ * xfs_mountfs() to read in the root and real-time bitmap inodes
+ * between calling xfs_log_mount() and here.
+ *
+ * mp - ubiquitous xfs mount point structure
+ */
+int
+xfs_log_mount_finish(xfs_mount_t *mp)
+{
+ int error;
+
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+ error = xlog_recover_finish(mp->m_log);
+ else {
+ error = 0;
+ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ }
+
+ return error;
+}
+
+/*
+ * Final log writes as part of unmount.
+ *
+ * Mark the filesystem clean as unmount happens. Note that during relocation
+ * this routine needs to be executed as part of source-bag while the
+ * deallocation must not be done until source-end.
+ */
+
+/*
+ * Unmount record used to have a string "Unmount filesystem--" in the
+ * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
+ * We just write the magic number now since that particular field isn't
+ * currently architecture converted and "nUmount" is a bit foo.
+ * As far as I know, there weren't any dependencies on the old behaviour.
+ */
+
+int
+xfs_log_unmount_write(xfs_mount_t *mp)
+{
+ xlog_t *log = mp->m_log;
+ xlog_in_core_t *iclog;
+#ifdef DEBUG
+ xlog_in_core_t *first_iclog;
+#endif
+ xlog_ticket_t *tic = NULL;
+ xfs_lsn_t lsn;
+ int error;
+
+ /*
+ * Don't write out unmount record on read-only mounts.
+ * Or, if we are doing a forced umount (typically because of IO errors).
+ */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return 0;
+
+ error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+ ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
+
+#ifdef DEBUG
+ first_iclog = iclog = log->l_iclog;
+ do {
+ if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+ ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
+ ASSERT(iclog->ic_offset == 0);
+ }
+ iclog = iclog->ic_next;
+ } while (iclog != first_iclog);
+#endif
+ if (! (XLOG_FORCED_SHUTDOWN(log))) {
+ error = xfs_log_reserve(mp, 600, 1, &tic,
+ XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
+ if (!error) {
+ /* the data section must be 32 bit size aligned */
+ struct {
+ __uint16_t magic;
+ __uint16_t pad1;
+ __uint32_t pad2; /* may as well make it 64 bits */
+ } magic = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ };
+ struct xfs_log_iovec reg = {
+ .i_addr = &magic,
+ .i_len = sizeof(magic),
+ .i_type = XLOG_REG_TYPE_UNMOUNT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
+
+ /* remove inited flag */
+ tic->t_flags = 0;
+ error = xlog_write(log, &vec, tic, &lsn,
+ NULL, XLOG_UNMOUNT_TRANS);
+ /*
+ * At this point, we're umounting anyway,
+ * so there's no point in transitioning log state
+ * to IOERROR. Just continue...
+ */
+ }
+
+ if (error)
+ xfs_alert(mp, "%s: unmount record failed", __func__);
+
+
+ spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ atomic_inc(&iclog->ic_refcnt);
+ xlog_state_want_sync(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ error = xlog_state_release_iclog(log, iclog);
+
+ spin_lock(&log->l_icloglock);
+ if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY)) {
+ if (!XLOG_FORCED_SHUTDOWN(log)) {
+ xlog_wait(&iclog->ic_force_wait,
+ &log->l_icloglock);
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+ if (tic) {
+ trace_xfs_log_umount_write(log, tic);
+ xlog_ungrant_log_space(log, tic);
+ xfs_log_ticket_put(tic);
+ }
+ } else {
+ /*
+ * We're already in forced_shutdown mode, couldn't
+ * even attempt to write out the unmount transaction.
+ *
+ * Go through the motions of sync'ing and releasing
+ * the iclog, even though no I/O will actually happen,
+ * we need to wait for other log I/Os that may already
+ * be in progress. Do this as a separate section of
+ * code so we'll know if we ever get stuck here that
+ * we're in this odd situation of trying to unmount
+ * a file system that went into forced_shutdown as
+ * the result of an unmount..
+ */
+ spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ atomic_inc(&iclog->ic_refcnt);
+
+ xlog_state_want_sync(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ error = xlog_state_release_iclog(log, iclog);
+
+ spin_lock(&log->l_icloglock);
+
+ if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
+ || iclog->ic_state == XLOG_STATE_DIRTY
+ || iclog->ic_state == XLOG_STATE_IOERROR) ) {
+
+ xlog_wait(&iclog->ic_force_wait,
+ &log->l_icloglock);
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+ }
+
+ return error;
+} /* xfs_log_unmount_write */
+
+/*
+ * Deallocate log structures for unmount/relocation.
+ *
+ * We need to stop the aild from running before we destroy
+ * and deallocate the log as the aild references the log.
+ */
+void
+xfs_log_unmount(xfs_mount_t *mp)
+{
+ xfs_trans_ail_destroy(mp);
+ xlog_dealloc_log(mp->m_log);
+}
+
+void
+xfs_log_item_init(
+ struct xfs_mount *mp,
+ struct xfs_log_item *item,
+ int type,
+ struct xfs_item_ops *ops)
+{
+ item->li_mountp = mp;
+ item->li_ailp = mp->m_ail;
+ item->li_type = type;
+ item->li_ops = ops;
+ item->li_lv = NULL;
+
+ INIT_LIST_HEAD(&item->li_ail);
+ INIT_LIST_HEAD(&item->li_cil);
+}
+
+/*
+ * Write region vectors to log. The write happens using the space reservation
+ * of the ticket (tic). It is not a requirement that all writes for a given
+ * transaction occur with one call to xfs_log_write(). However, it is important
+ * to note that the transaction reservation code makes an assumption about the
+ * number of log headers a transaction requires that may be violated if you
+ * don't pass all the transaction vectors in one call....
+ */
+int
+xfs_log_write(
+ struct xfs_mount *mp,
+ struct xfs_log_iovec reg[],
+ int nentries,
+ struct xlog_ticket *tic,
+ xfs_lsn_t *start_lsn)
+{
+ struct log *log = mp->m_log;
+ int error;
+ struct xfs_log_vec vec = {
+ .lv_niovecs = nentries,
+ .lv_iovecp = reg,
+ };
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return XFS_ERROR(EIO);
+
+ error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+}
+
+void
+xfs_log_move_tail(xfs_mount_t *mp,
+ xfs_lsn_t tail_lsn)
+{
+ xlog_ticket_t *tic;
+ xlog_t *log = mp->m_log;
+ int need_bytes, free_bytes;
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return;
+
+ if (tail_lsn == 0)
+ tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+
+ /* tail_lsn == 1 implies that we weren't passed a valid value. */
+ if (tail_lsn != 1)
+ atomic64_set(&log->l_tail_lsn, tail_lsn);
+
+ if (!list_empty_careful(&log->l_writeq)) {
+#ifdef DEBUG
+ if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+ panic("Recovery problem");
+#endif
+ spin_lock(&log->l_grant_write_lock);
+ free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+ list_for_each_entry(tic, &log->l_writeq, t_queue) {
+ ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+
+ if (free_bytes < tic->t_unit_res && tail_lsn != 1)
+ break;
+ tail_lsn = 0;
+ free_bytes -= tic->t_unit_res;
+ trace_xfs_log_regrant_write_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+ spin_unlock(&log->l_grant_write_lock);
+ }
+
+ if (!list_empty_careful(&log->l_reserveq)) {
+#ifdef DEBUG
+ if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+ panic("Recovery problem");
+#endif
+ spin_lock(&log->l_grant_reserve_lock);
+ free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+ list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+ if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+ need_bytes = tic->t_unit_res*tic->t_cnt;
+ else
+ need_bytes = tic->t_unit_res;
+ if (free_bytes < need_bytes && tail_lsn != 1)
+ break;
+ tail_lsn = 0;
+ free_bytes -= need_bytes;
+ trace_xfs_log_grant_wake_up(log, tic);
+ wake_up(&tic->t_wait);
+ }
+ spin_unlock(&log->l_grant_reserve_lock);
+ }
+}
+
+/*
+ * Determine if we have a transaction that has gone to disk
+ * that needs to be covered. To begin the transition to the idle state
+ * firstly the log needs to be idle (no AIL and nothing in the iclogs).
+ * If we are then in a state where covering is needed, the caller is informed
+ * that dummy transactions are required to move the log into the idle state.
+ *
+ * Because this is called as part of the sync process, we should also indicate
+ * that dummy transactions should be issued in anything but the covered or
+ * idle states. This ensures that the log tail is accurately reflected in
+ * the log at the end of the sync, hence if a crash occurrs avoids replay
+ * of transactions where the metadata is already on disk.
+ */
+int
+xfs_log_need_covered(xfs_mount_t *mp)
+{
+ int needed = 0;
+ xlog_t *log = mp->m_log;
+
+ if (!xfs_fs_writable(mp))
+ return 0;
+
+ spin_lock(&log->l_icloglock);
+ switch (log->l_covered_state) {
+ case XLOG_STATE_COVER_DONE:
+ case XLOG_STATE_COVER_DONE2:
+ case XLOG_STATE_COVER_IDLE:
+ break;
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ if (!xfs_ail_min_lsn(log->l_ailp) &&
+ xlog_iclogs_empty(log)) {
+ if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+ log->l_covered_state = XLOG_STATE_COVER_DONE;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_DONE2;
+ }
+ /* FALLTHRU */
+ default:
+ needed = 1;
+ break;
+ }
+ spin_unlock(&log->l_icloglock);
+ return needed;
+}
+
+/******************************************************************************
+ *
+ * local routines
+ *
+ ******************************************************************************
+ */
+
+/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
+ * The log manager must keep track of the last LR which was committed
+ * to disk. The lsn of this LR will become the new tail_lsn whenever
+ * xfs_trans_tail_ail returns 0. If we don't do this, we run into
+ * the situation where stuff could be written into the log but nothing
+ * was ever in the AIL when asked. Eventually, we panic since the
+ * tail hits the head.
+ *
+ * We may be holding the log iclog lock upon entering this routine.
+ */
+xfs_lsn_t
+xlog_assign_tail_lsn(
+ struct xfs_mount *mp)
+{
+ xfs_lsn_t tail_lsn;
+ struct log *log = mp->m_log;
+
+ tail_lsn = xfs_ail_min_lsn(mp->m_ail);
+ if (!tail_lsn)
+ tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+
+ atomic64_set(&log->l_tail_lsn, tail_lsn);
+ return tail_lsn;
+}
+
+/*
+ * Return the space in the log between the tail and the head. The head
+ * is passed in the cycle/bytes formal parms. In the special case where
+ * the reserve head has wrapped passed the tail, this calculation is no
+ * longer valid. In this case, just return 0 which means there is no space
+ * in the log. This works for all places where this function is called
+ * with the reserve head. Of course, if the write head were to ever
+ * wrap the tail, we should blow up. Rather than catch this case here,
+ * we depend on other ASSERTions in other parts of the code. XXXmiken
+ *
+ * This code also handles the case where the reservation head is behind
+ * the tail. The details of this case are described below, but the end
+ * result is that we return the size of the log as the amount of space left.
+ */
+STATIC int
+xlog_space_left(
+ struct log *log,
+ atomic64_t *head)
+{
+ int free_bytes;
+ int tail_bytes;
+ int tail_cycle;
+ int head_cycle;
+ int head_bytes;
+
+ xlog_crack_grant_head(head, &head_cycle, &head_bytes);
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
+ tail_bytes = BBTOB(tail_bytes);
+ if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
+ free_bytes = log->l_logsize - (head_bytes - tail_bytes);
+ else if (tail_cycle + 1 < head_cycle)
+ return 0;
+ else if (tail_cycle < head_cycle) {
+ ASSERT(tail_cycle == (head_cycle - 1));
+ free_bytes = tail_bytes - head_bytes;
+ } else {
+ /*
+ * The reservation head is behind the tail.
+ * In this case we just want to return the size of the
+ * log as the amount of space left.
+ */
+ xfs_alert(log->l_mp,
+ "xlog_space_left: head behind tail\n"
+ " tail_cycle = %d, tail_bytes = %d\n"
+ " GH cycle = %d, GH bytes = %d",
+ tail_cycle, tail_bytes, head_cycle, head_bytes);
+ ASSERT(0);
+ free_bytes = log->l_logsize;
+ }
+ return free_bytes;
+}
+
+
+/*
+ * Log function which is called when an io completes.
+ *
+ * The log manager needs its own routine, in order to control what
+ * happens with the buffer after the write completes.
+ */
+void
+xlog_iodone(xfs_buf_t *bp)
+{
+ xlog_in_core_t *iclog;
+ xlog_t *l;
+ int aborted;
+
+ iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
+ XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+ aborted = 0;
+ l = iclog->ic_log;
+
+ /*
+ * Race to shutdown the filesystem if we see an error.
+ */
+ if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
+ XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
+ xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
+ XFS_BUF_STALE(bp);
+ xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ /*
+ * This flag will be propagated to the trans-committed
+ * callback routines to let them know that the log-commit
+ * didn't succeed.
+ */
+ aborted = XFS_LI_ABORTED;
+ } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ aborted = XFS_LI_ABORTED;
+ }
+
+ /* log I/O is always issued ASYNC */
+ ASSERT(XFS_BUF_ISASYNC(bp));
+ xlog_state_done_syncing(iclog, aborted);
+ /*
+ * do not reference the buffer (bp) here as we could race
+ * with it being freed after writing the unmount record to the
+ * log.
+ */
+
+} /* xlog_iodone */
+
+/*
+ * Return size of each in-core log record buffer.
+ *
+ * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
+ *
+ * If the filesystem blocksize is too large, we may need to choose a
+ * larger size since the directory code currently logs entire blocks.
+ */
+
+STATIC void
+xlog_get_iclog_buffer_size(xfs_mount_t *mp,
+ xlog_t *log)
+{
+ int size;
+ int xhdrs;
+
+ if (mp->m_logbufs <= 0)
+ log->l_iclog_bufs = XLOG_MAX_ICLOGS;
+ else
+ log->l_iclog_bufs = mp->m_logbufs;
+
+ /*
+ * Buffer size passed in from mount system call.
+ */
+ if (mp->m_logbsize > 0) {
+ size = log->l_iclog_size = mp->m_logbsize;
+ log->l_iclog_size_log = 0;
+ while (size != 1) {
+ log->l_iclog_size_log++;
+ size >>= 1;
+ }
+
+ if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+ /* # headers = size / 32k
+ * one header holds cycles from 32k of data
+ */
+
+ xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
+ if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
+ xhdrs++;
+ log->l_iclog_hsize = xhdrs << BBSHIFT;
+ log->l_iclog_heads = xhdrs;
+ } else {
+ ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
+ log->l_iclog_hsize = BBSIZE;
+ log->l_iclog_heads = 1;
+ }
+ goto done;
+ }
+
+ /* All machines use 32kB buffers by default. */
+ log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
+ log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
+
+ /* the default log size is 16k or 32k which is one header sector */
+ log->l_iclog_hsize = BBSIZE;
+ log->l_iclog_heads = 1;
+
+done:
+ /* are we being asked to make the sizes selected above visible? */
+ if (mp->m_logbufs == 0)
+ mp->m_logbufs = log->l_iclog_bufs;
+ if (mp->m_logbsize == 0)
+ mp->m_logbsize = log->l_iclog_size;
+} /* xlog_get_iclog_buffer_size */
+
+
+/*
+ * This routine initializes some of the log structure for a given mount point.
+ * Its primary purpose is to fill in enough, so recovery can occur. However,
+ * some other stuff may be filled in too.
+ */
+STATIC xlog_t *
+xlog_alloc_log(xfs_mount_t *mp,
+ xfs_buftarg_t *log_target,
+ xfs_daddr_t blk_offset,
+ int num_bblks)
+{
+ xlog_t *log;
+ xlog_rec_header_t *head;
+ xlog_in_core_t **iclogp;
+ xlog_in_core_t *iclog, *prev_iclog=NULL;
+ xfs_buf_t *bp;
+ int i;
+ int error = ENOMEM;
+ uint log2_size = 0;
+
+ log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
+ if (!log) {
+ xfs_warn(mp, "Log allocation failed: No memory!");
+ goto out;
+ }
+
+ log->l_mp = mp;
+ log->l_targ = log_target;
+ log->l_logsize = BBTOB(num_bblks);
+ log->l_logBBstart = blk_offset;
+ log->l_logBBsize = num_bblks;
+ log->l_covered_state = XLOG_STATE_COVER_IDLE;
+ log->l_flags |= XLOG_ACTIVE_RECOVERY;
+
+ log->l_prev_block = -1;
+ /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
+ xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
+ log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
+ xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
+ xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
+ INIT_LIST_HEAD(&log->l_reserveq);
+ INIT_LIST_HEAD(&log->l_writeq);
+ spin_lock_init(&log->l_grant_reserve_lock);
+ spin_lock_init(&log->l_grant_write_lock);
+
+ error = EFSCORRUPTED;
+ if (xfs_sb_version_hassector(&mp->m_sb)) {
+ log2_size = mp->m_sb.sb_logsectlog;
+ if (log2_size < BBSHIFT) {
+ xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
+ log2_size, BBSHIFT);
+ goto out_free_log;
+ }
+
+ log2_size -= BBSHIFT;
+ if (log2_size > mp->m_sectbb_log) {
+ xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
+ log2_size, mp->m_sectbb_log);
+ goto out_free_log;
+ }
+
+ /* for larger sector sizes, must have v2 or external log */
+ if (log2_size && log->l_logBBstart > 0 &&
+ !xfs_sb_version_haslogv2(&mp->m_sb)) {
+ xfs_warn(mp,
+ "log sector size (0x%x) invalid for configuration.",
+ log2_size);
+ goto out_free_log;
+ }
+ }
+ log->l_sectBBsize = 1 << log2_size;
+
+ xlog_get_iclog_buffer_size(mp, log);
+
+ error = ENOMEM;
+ bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
+ if (!bp)
+ goto out_free_log;
+ XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
+ XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+ log->l_xbuf = bp;
+
+ spin_lock_init(&log->l_icloglock);
+ init_waitqueue_head(&log->l_flush_wait);
+
+ /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
+ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
+
+ iclogp = &log->l_iclog;
+ /*
+ * The amount of memory to allocate for the iclog structure is
+ * rather funky due to the way the structure is defined. It is
+ * done this way so that we can use different sizes for machines
+ * with different amounts of memory. See the definition of
+ * xlog_in_core_t in xfs_log_priv.h for details.
+ */
+ ASSERT(log->l_iclog_size >= 4096);
+ for (i=0; i < log->l_iclog_bufs; i++) {
+ *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
+ if (!*iclogp)
+ goto out_free_iclog;
+
+ iclog = *iclogp;
+ iclog->ic_prev = prev_iclog;
+ prev_iclog = iclog;
+
+ bp = xfs_buf_get_uncached(mp->m_logdev_targp,
+ log->l_iclog_size, 0);
+ if (!bp)
+ goto out_free_iclog;
+ if (!XFS_BUF_CPSEMA(bp))
+ ASSERT(0);
+ XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
+ XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+ iclog->ic_bp = bp;
+ iclog->ic_data = bp->b_addr;
+#ifdef DEBUG
+ log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+#endif
+ head = &iclog->ic_header;
+ memset(head, 0, sizeof(xlog_rec_header_t));
+ head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+ head->h_version = cpu_to_be32(
+ xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+ head->h_size = cpu_to_be32(log->l_iclog_size);
+ /* new fields */
+ head->h_fmt = cpu_to_be32(XLOG_FMT);
+ memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+
+ iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_log = log;
+ atomic_set(&iclog->ic_refcnt, 0);
+ spin_lock_init(&iclog->ic_callback_lock);
+ iclog->ic_callback_tail = &(iclog->ic_callback);
+ iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+
+ ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
+ ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
+ init_waitqueue_head(&iclog->ic_force_wait);
+ init_waitqueue_head(&iclog->ic_write_wait);
+
+ iclogp = &iclog->ic_next;
+ }
+ *iclogp = log->l_iclog; /* complete ring */
+ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
+
+ error = xlog_cil_init(log);
+ if (error)
+ goto out_free_iclog;
+ return log;
+
+out_free_iclog:
+ for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
+ prev_iclog = iclog->ic_next;
+ if (iclog->ic_bp)
+ xfs_buf_free(iclog->ic_bp);
+ kmem_free(iclog);
+ }
+ spinlock_destroy(&log->l_icloglock);
+ xfs_buf_free(log->l_xbuf);
+out_free_log:
+ kmem_free(log);
+out:
+ return ERR_PTR(-error);
+} /* xlog_alloc_log */
+
+
+/*
+ * Write out the commit record of a transaction associated with the given
+ * ticket. Return the lsn of the commit record.
+ */
+STATIC int
+xlog_commit_record(
+ struct log *log,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog,
+ xfs_lsn_t *commitlsnp)
+{
+ struct xfs_mount *mp = log->l_mp;
+ int error;
+ struct xfs_log_iovec reg = {
+ .i_addr = NULL,
+ .i_len = 0,
+ .i_type = XLOG_REG_TYPE_COMMIT,
+ };
+ struct xfs_log_vec vec = {
+ .lv_niovecs = 1,
+ .lv_iovecp = &reg,
+ };
+
+ ASSERT_ALWAYS(iclog);
+ error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
+ XLOG_COMMIT_TRANS);
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ return error;
+}
+
+/*
+ * Push on the buffer cache code if we ever use more than 75% of the on-disk
+ * log space. This code pushes on the lsn which would supposedly free up
+ * the 25% which we want to leave free. We may need to adopt a policy which
+ * pushes on an lsn which is further along in the log once we reach the high
+ * water mark. In this manner, we would be creating a low water mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct log *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn = 0;
+ xfs_lsn_t last_sync_lsn;
+ int free_blocks;
+ int free_bytes;
+ int threshold_block;
+ int threshold_cycle;
+ int free_threshold;
+
+ ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+
+ free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+ free_blocks = BTOBBT(free_bytes);
+
+ /*
+ * Set the threshold for the minimum number of free blocks in the
+ * log to the maximum of what the caller needs, one quarter of the
+ * log, and 256 blocks.
+ */
+ free_threshold = BTOBB(need_bytes);
+ free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+ free_threshold = MAX(free_threshold, 256);
+ if (free_blocks >= free_threshold)
+ return;
+
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
+ &threshold_block);
+ threshold_block += free_threshold;
+ if (threshold_block >= log->l_logBBsize) {
+ threshold_block -= log->l_logBBsize;
+ threshold_cycle += 1;
+ }
+ threshold_lsn = xlog_assign_lsn(threshold_cycle,
+ threshold_block);
+ /*
+ * Don't pass in an lsn greater than the lsn of the last
+ * log record known to be on disk. Use a snapshot of the last sync lsn
+ * so that it doesn't change between the compare and the set.
+ */
+ last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+ if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+ threshold_lsn = last_sync_lsn;
+
+ /*
+ * Get the transaction layer to kick the dirty buffers out to
+ * disk asynchronously. No point in trying to do this if
+ * the filesystem is shutting down.
+ */
+ if (!XLOG_FORCED_SHUTDOWN(log))
+ xfs_ail_push(log->l_ailp, threshold_lsn);
+}
+
+/*
+ * The bdstrat callback function for log bufs. This gives us a central
+ * place to trap bufs in case we get hit by a log I/O error and need to
+ * shutdown. Actually, in practice, even when we didn't get a log error,
+ * we transition the iclogs to IOERROR state *after* flushing all existing
+ * iclogs to disk. This is because we don't want anymore new transactions to be
+ * started or completed afterwards.
+ */
+STATIC int
+xlog_bdstrat(
+ struct xfs_buf *bp)
+{
+ struct xlog_in_core *iclog;
+
+ iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ XFS_BUF_ERROR(bp, EIO);
+ XFS_BUF_STALE(bp);
+ xfs_buf_ioend(bp, 0);
+ /*
+ * It would seem logical to return EIO here, but we rely on
+ * the log state machine to propagate I/O errors instead of
+ * doing it here.
+ */
+ return 0;
+ }
+
+ bp->b_flags |= _XBF_RUN_QUEUES;
+ xfs_buf_iorequest(bp);
+ return 0;
+}
+
+/*
+ * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
+ * fashion. Previously, we should have moved the current iclog
+ * ptr in the log to point to the next available iclog. This allows further
+ * write to continue while this code syncs out an iclog ready to go.
+ * Before an in-core log can be written out, the data section must be scanned
+ * to save away the 1st word of each BBSIZE block into the header. We replace
+ * it with the current cycle count. Each BBSIZE block is tagged with the
+ * cycle count because there in an implicit assumption that drives will
+ * guarantee that entire 512 byte blocks get written at once. In other words,
+ * we can't have part of a 512 byte block written and part not written. By
+ * tagging each block, we will know which blocks are valid when recovering
+ * after an unclean shutdown.
+ *
+ * This routine is single threaded on the iclog. No other thread can be in
+ * this routine with the same iclog. Changing contents of iclog can there-
+ * fore be done without grabbing the state machine lock. Updating the global
+ * log will require grabbing the lock though.
+ *
+ * The entire log manager uses a logical block numbering scheme. Only
+ * log_sync (and then only bwrite()) know about the fact that the log may
+ * not start with block zero on a given device. The log block start offset
+ * is added immediately before calling bwrite().
+ */
+
+STATIC int
+xlog_sync(xlog_t *log,
+ xlog_in_core_t *iclog)
+{
+ xfs_caddr_t dptr; /* pointer to byte sized element */
+ xfs_buf_t *bp;
+ int i;
+ uint count; /* byte count of bwrite */
+ uint count_init; /* initial count before roundup */
+ int roundoff; /* roundoff to BB or stripe */
+ int split = 0; /* split write into two regions */
+ int error;
+ int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
+
+ XFS_STATS_INC(xs_log_writes);
+ ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+
+ /* Add for LR header */
+ count_init = log->l_iclog_hsize + iclog->ic_offset;
+
+ /* Round out the log write size */
+ if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
+ /* we have a v2 stripe unit to use */
+ count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
+ } else {
+ count = BBTOB(BTOBB(count_init));
+ }
+ roundoff = count - count_init;
+ ASSERT(roundoff >= 0);
+ ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
+ roundoff < log->l_mp->m_sb.sb_logsunit)
+ ||
+ (log->l_mp->m_sb.sb_logsunit <= 1 &&
+ roundoff < BBTOB(1)));
+
+ /* move grant heads by roundoff in sync */
+ xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
+ xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
+
+ /* put cycle number in every block */
+ xlog_pack_data(log, iclog, roundoff);
+
+ /* real byte length */
+ if (v2) {
+ iclog->ic_header.h_len =
+ cpu_to_be32(iclog->ic_offset + roundoff);
+ } else {
+ iclog->ic_header.h_len =
+ cpu_to_be32(iclog->ic_offset);
+ }
+
+ bp = iclog->ic_bp;
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
+ XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
+ XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
+
+ XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+
+ /* Do we need to split this write into 2 parts? */
+ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+ split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
+ count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
+ iclog->ic_bwritecnt = 2; /* split into 2 writes */
+ } else {
+ iclog->ic_bwritecnt = 1;
+ }
+ XFS_BUF_SET_COUNT(bp, count);
+ XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */
+ XFS_BUF_ZEROFLAGS(bp);
+ XFS_BUF_BUSY(bp);
+ XFS_BUF_ASYNC(bp);
+ bp->b_flags |= XBF_LOG_BUFFER;
+
+ if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
+ /*
+ * If we have an external log device, flush the data device
+ * before flushing the log to make sure all meta data
+ * written back from the AIL actually made it to disk
+ * before writing out the new log tail LSN in the log buffer.
+ */
+ if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+ xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
+ XFS_BUF_ORDERED(bp);
+ }
+
+ ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+ ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+ xlog_verify_iclog(log, iclog, count, B_TRUE);
+
+ /* account for log which doesn't start at block #0 */
+ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+ /*
+ * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
+ * is shutting down.
+ */
+ XFS_BUF_WRITE(bp);
+
+ if ((error = xlog_bdstrat(bp))) {
+ xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
+ XFS_BUF_ADDR(bp));
+ return error;
+ }
+ if (split) {
+ bp = iclog->ic_log->l_xbuf;
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
+ (unsigned long)1);
+ XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
+ XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
+ XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
+ (__psint_t)count), split);
+ XFS_BUF_SET_FSPRIVATE(bp, iclog);
+ XFS_BUF_ZEROFLAGS(bp);
+ XFS_BUF_BUSY(bp);
+ XFS_BUF_ASYNC(bp);
+ bp->b_flags |= XBF_LOG_BUFFER;
+ if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+ XFS_BUF_ORDERED(bp);
+ dptr = XFS_BUF_PTR(bp);
+ /*
+ * Bump the cycle numbers at the start of each block
+ * since this part of the buffer is at the start of
+ * a new cycle. Watch out for the header magic number
+ * case, though.
+ */
+ for (i = 0; i < split; i += BBSIZE) {
+ be32_add_cpu((__be32 *)dptr, 1);
+ if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
+ be32_add_cpu((__be32 *)dptr, 1);
+ dptr += BBSIZE;
+ }
+
+ ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+ ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+ /* account for internal log which doesn't start at block #0 */
+ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+ XFS_BUF_WRITE(bp);
+ if ((error = xlog_bdstrat(bp))) {
+ xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
+ bp, XFS_BUF_ADDR(bp));
+ return error;
+ }
+ }
+ return 0;
+} /* xlog_sync */
+
+
+/*
+ * Deallocate a log structure
+ */
+STATIC void
+xlog_dealloc_log(xlog_t *log)
+{
+ xlog_in_core_t *iclog, *next_iclog;
+ int i;
+
+ xlog_cil_destroy(log);
+
+ /*
+ * always need to ensure that the extra buffer does not point to memory
+ * owned by another log buffer before we free it.
+ */
+ xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
+ xfs_buf_free(log->l_xbuf);
+
+ iclog = log->l_iclog;
+ for (i=0; i<log->l_iclog_bufs; i++) {
+ xfs_buf_free(iclog->ic_bp);
+ next_iclog = iclog->ic_next;
+ kmem_free(iclog);
+ iclog = next_iclog;
+ }
+ spinlock_destroy(&log->l_icloglock);
+
+ log->l_mp->m_log = NULL;
+ kmem_free(log);
+} /* xlog_dealloc_log */
+
+/*
+ * Update counters atomically now that memcpy is done.
+ */
+/* ARGSUSED */
+static inline void
+xlog_state_finish_copy(xlog_t *log,
+ xlog_in_core_t *iclog,
+ int record_cnt,
+ int copy_bytes)
+{
+ spin_lock(&log->l_icloglock);
+
+ be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
+ iclog->ic_offset += copy_bytes;
+
+ spin_unlock(&log->l_icloglock);
+} /* xlog_state_finish_copy */
+
+
+
+
+/*
+ * print out info relating to regions written which consume
+ * the reservation
+ */
+void
+xlog_print_tic_res(
+ struct xfs_mount *mp,
+ struct xlog_ticket *ticket)
+{
+ uint i;
+ uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
+
+ /* match with XLOG_REG_TYPE_* in xfs_log.h */
+ static char *res_type_str[XLOG_REG_TYPE_MAX] = {
+ "bformat",
+ "bchunk",
+ "efi_format",
+ "efd_format",
+ "iformat",
+ "icore",
+ "iext",
+ "ibroot",
+ "ilocal",
+ "iattr_ext",
+ "iattr_broot",
+ "iattr_local",
+ "qformat",
+ "dquot",
+ "quotaoff",
+ "LR header",
+ "unmount",
+ "commit",
+ "trans header"
+ };
+ static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
+ "SETATTR_NOT_SIZE",
+ "SETATTR_SIZE",
+ "INACTIVE",
+ "CREATE",
+ "CREATE_TRUNC",
+ "TRUNCATE_FILE",
+ "REMOVE",
+ "LINK",
+ "RENAME",
+ "MKDIR",
+ "RMDIR",
+ "SYMLINK",
+ "SET_DMATTRS",
+ "GROWFS",
+ "STRAT_WRITE",
+ "DIOSTRAT",
+ "WRITE_SYNC",
+ "WRITEID",
+ "ADDAFORK",
+ "ATTRINVAL",
+ "ATRUNCATE",
+ "ATTR_SET",
+ "ATTR_RM",
+ "ATTR_FLAG",
+ "CLEAR_AGI_BUCKET",
+ "QM_SBCHANGE",
+ "DUMMY1",
+ "DUMMY2",
+ "QM_QUOTAOFF",
+ "QM_DQALLOC",
+ "QM_SETQLIM",
+ "QM_DQCLUSTER",
+ "QM_QINOCREATE",
+ "QM_QUOTAOFF_END",
+ "SB_UNIT",
+ "FSYNC_TS",
+ "GROWFSRT_ALLOC",
+ "GROWFSRT_ZERO",
+ "GROWFSRT_FREE",
+ "SWAPEXT"
+ };
+
+ xfs_warn(mp,
+ "xfs_log_write: reservation summary:\n"
+ " trans type = %s (%u)\n"
+ " unit res = %d bytes\n"
+ " current res = %d bytes\n"
+ " total reg = %u bytes (o/flow = %u bytes)\n"
+ " ophdrs = %u (ophdr space = %u bytes)\n"
+ " ophdr + reg = %u bytes\n"
+ " num regions = %u\n",
+ ((ticket->t_trans_type <= 0 ||
+ ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+ "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+ ticket->t_trans_type,
+ ticket->t_unit_res,
+ ticket->t_curr_res,
+ ticket->t_res_arr_sum, ticket->t_res_o_flow,
+ ticket->t_res_num_ophdrs, ophdr_spc,
+ ticket->t_res_arr_sum +
+ ticket->t_res_o_flow + ophdr_spc,
+ ticket->t_res_num);
+
+ for (i = 0; i < ticket->t_res_num; i++) {
+ uint r_type = ticket->t_res_arr[i].r_type;
+ xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
+ ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
+ "bad-rtype" : res_type_str[r_type-1]),
+ ticket->t_res_arr[i].r_len);
+ }
+
+ xfs_alert_tag(mp, XFS_PTAG_LOGRES,
+ "xfs_log_write: reservation ran out. Need to up reservation");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+}
+
+/*
+ * Calculate the potential space needed by the log vector. Each region gets
+ * its own xlog_op_header_t and may need to be double word aligned.
+ */
+static int
+xlog_write_calc_vec_length(
+ struct xlog_ticket *ticket,
+ struct xfs_log_vec *log_vector)
+{
+ struct xfs_log_vec *lv;
+ int headers = 0;
+ int len = 0;
+ int i;
+
+ /* acct for start rec of xact */
+ if (ticket->t_flags & XLOG_TIC_INITED)
+ headers++;
+
+ for (lv = log_vector; lv; lv = lv->lv_next) {
+ headers += lv->lv_niovecs;
+
+ for (i = 0; i < lv->lv_niovecs; i++) {
+ struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+
+ len += vecp->i_len;
+ xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
+ }
+ }
+
+ ticket->t_res_num_ophdrs += headers;
+ len += headers * sizeof(struct xlog_op_header);
+
+ return len;
+}
+
+/*
+ * If first write for transaction, insert start record We can't be trying to
+ * commit if we are inited. We can't have any "partial_copy" if we are inited.
+ */
+static int
+xlog_write_start_rec(
+ struct xlog_op_header *ophdr,
+ struct xlog_ticket *ticket)
+{
+ if (!(ticket->t_flags & XLOG_TIC_INITED))
+ return 0;
+
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = ticket->t_clientid;
+ ophdr->oh_len = 0;
+ ophdr->oh_flags = XLOG_START_TRANS;
+ ophdr->oh_res2 = 0;
+
+ ticket->t_flags &= ~XLOG_TIC_INITED;
+
+ return sizeof(struct xlog_op_header);
+}
+
+static xlog_op_header_t *
+xlog_write_setup_ophdr(
+ struct log *log,
+ struct xlog_op_header *ophdr,
+ struct xlog_ticket *ticket,
+ uint flags)
+{
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = ticket->t_clientid;
+ ophdr->oh_res2 = 0;
+
+ /* are we copying a commit or unmount record? */
+ ophdr->oh_flags = flags;
+
+ /*
+ * We've seen logs corrupted with bad transaction client ids. This
+ * makes sure that XFS doesn't generate them on. Turn this into an EIO
+ * and shut down the filesystem.
+ */
+ switch (ophdr->oh_clientid) {
+ case XFS_TRANSACTION:
+ case XFS_VOLUME:
+ case XFS_LOG:
+ break;
+ default:
+ xfs_warn(log->l_mp,
+ "Bad XFS transaction clientid 0x%x in ticket 0x%p",
+ ophdr->oh_clientid, ticket);
+ return NULL;
+ }
+
+ return ophdr;
+}
+
+/*
+ * Set up the parameters of the region copy into the log. This has
+ * to handle region write split across multiple log buffers - this
+ * state is kept external to this function so that this code can
+ * can be written in an obvious, self documenting manner.
+ */
+static int
+xlog_write_setup_copy(
+ struct xlog_ticket *ticket,
+ struct xlog_op_header *ophdr,
+ int space_available,
+ int space_required,
+ int *copy_off,
+ int *copy_len,
+ int *last_was_partial_copy,
+ int *bytes_consumed)
+{
+ int still_to_copy;
+
+ still_to_copy = space_required - *bytes_consumed;
+ *copy_off = *bytes_consumed;
+
+ if (still_to_copy <= space_available) {
+ /* write of region completes here */
+ *copy_len = still_to_copy;
+ ophdr->oh_len = cpu_to_be32(*copy_len);
+ if (*last_was_partial_copy)
+ ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+ *last_was_partial_copy = 0;
+ *bytes_consumed = 0;
+ return 0;
+ }
+
+ /* partial write of region, needs extra log op header reservation */
+ *copy_len = space_available;
+ ophdr->oh_len = cpu_to_be32(*copy_len);
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+ if (*last_was_partial_copy)
+ ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
+ *bytes_consumed += *copy_len;
+ (*last_was_partial_copy)++;
+
+ /* account for new log op header */
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ ticket->t_res_num_ophdrs++;
+
+ return sizeof(struct xlog_op_header);
+}
+
+static int
+xlog_write_copy_finish(
+ struct log *log,
+ struct xlog_in_core *iclog,
+ uint flags,
+ int *record_cnt,
+ int *data_cnt,
+ int *partial_copy,
+ int *partial_copy_len,
+ int log_offset,
+ struct xlog_in_core **commit_iclog)
+{
+ if (*partial_copy) {
+ /*
+ * This iclog has already been marked WANT_SYNC by
+ * xlog_state_get_iclog_space.
+ */
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ *record_cnt = 0;
+ *data_cnt = 0;
+ return xlog_state_release_iclog(log, iclog);
+ }
+
+ *partial_copy = 0;
+ *partial_copy_len = 0;
+
+ if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+ /* no more space in this iclog - push it. */
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ *record_cnt = 0;
+ *data_cnt = 0;
+
+ spin_lock(&log->l_icloglock);
+ xlog_state_want_sync(log, iclog);
+ spin_unlock(&log->l_icloglock);
+
+ if (!commit_iclog)
+ return xlog_state_release_iclog(log, iclog);
+ ASSERT(flags & XLOG_COMMIT_TRANS);
+ *commit_iclog = iclog;
+ }
+
+ return 0;
+}
+
+/*
+ * Write some region out to in-core log
+ *
+ * This will be called when writing externally provided regions or when
+ * writing out a commit record for a given transaction.
+ *
+ * General algorithm:
+ * 1. Find total length of this write. This may include adding to the
+ * lengths passed in.
+ * 2. Check whether we violate the tickets reservation.
+ * 3. While writing to this iclog
+ * A. Reserve as much space in this iclog as can get
+ * B. If this is first write, save away start lsn
+ * C. While writing this region:
+ * 1. If first write of transaction, write start record
+ * 2. Write log operation header (header per region)
+ * 3. Find out if we can fit entire region into this iclog
+ * 4. Potentially, verify destination memcpy ptr
+ * 5. Memcpy (partial) region
+ * 6. If partial copy, release iclog; otherwise, continue
+ * copying more regions into current iclog
+ * 4. Mark want sync bit (in simulation mode)
+ * 5. Release iclog for potential flush to on-disk log.
+ *
+ * ERRORS:
+ * 1. Panic if reservation is overrun. This should never happen since
+ * reservation amounts are generated internal to the filesystem.
+ * NOTES:
+ * 1. Tickets are single threaded data structures.
+ * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
+ * syncing routine. When a single log_write region needs to span
+ * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
+ * on all log operation writes which don't contain the end of the
+ * region. The XLOG_END_TRANS bit is used for the in-core log
+ * operation which contains the end of the continued log_write region.
+ * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
+ * we don't really know exactly how much space will be used. As a result,
+ * we don't update ic_offset until the end when we know exactly how many
+ * bytes have been written out.
+ */
+int
+xlog_write(
+ struct log *log,
+ struct xfs_log_vec *log_vector,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *start_lsn,
+ struct xlog_in_core **commit_iclog,
+ uint flags)
+{
+ struct xlog_in_core *iclog = NULL;
+ struct xfs_log_iovec *vecp;
+ struct xfs_log_vec *lv;
+ int len;
+ int index;
+ int partial_copy = 0;
+ int partial_copy_len = 0;
+ int contwr = 0;
+ int record_cnt = 0;
+ int data_cnt = 0;
+ int error;
+
+ *start_lsn = 0;
+
+ len = xlog_write_calc_vec_length(ticket, log_vector);
+ if (log->l_cilp) {
+ /*
+ * Region headers and bytes are already accounted for.
+ * We only need to take into account start records and
+ * split regions in this function.
+ */
+ if (ticket->t_flags & XLOG_TIC_INITED)
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
+
+ /*
+ * Commit record headers need to be accounted for. These
+ * come in as separate writes so are easy to detect.
+ */
+ if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+ ticket->t_curr_res -= sizeof(xlog_op_header_t);
+ } else
+ ticket->t_curr_res -= len;
+
+ if (ticket->t_curr_res < 0)
+ xlog_print_tic_res(log->l_mp, ticket);
+
+ index = 0;
+ lv = log_vector;
+ vecp = lv->lv_iovecp;
+ while (lv && index < lv->lv_niovecs) {
+ void *ptr;
+ int log_offset;
+
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &contwr, &log_offset);
+ if (error)
+ return error;
+
+ ASSERT(log_offset <= iclog->ic_size - 1);
+ ptr = iclog->ic_datap + log_offset;
+
+ /* start_lsn is the first lsn written to. That's all we need. */
+ if (!*start_lsn)
+ *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+
+ /*
+ * This loop writes out as many regions as can fit in the amount
+ * of space which was allocated by xlog_state_get_iclog_space().
+ */
+ while (lv && index < lv->lv_niovecs) {
+ struct xfs_log_iovec *reg = &vecp[index];
+ struct xlog_op_header *ophdr;
+ int start_rec_copy;
+ int copy_len;
+ int copy_off;
+
+ ASSERT(reg->i_len % sizeof(__int32_t) == 0);
+ ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
+
+ start_rec_copy = xlog_write_start_rec(ptr, ticket);
+ if (start_rec_copy) {
+ record_cnt++;
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ start_rec_copy);
+ }
+
+ ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
+ if (!ophdr)
+ return XFS_ERROR(EIO);
+
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ sizeof(struct xlog_op_header));
+
+ len += xlog_write_setup_copy(ticket, ophdr,
+ iclog->ic_size-log_offset,
+ reg->i_len,
+ &copy_off, &copy_len,
+ &partial_copy,
+ &partial_copy_len);
+ xlog_verify_dest_ptr(log, ptr);
+
+ /* copy region */
+ ASSERT(copy_len >= 0);
+ memcpy(ptr, reg->i_addr + copy_off, copy_len);
+ xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
+
+ copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+ record_cnt++;
+ data_cnt += contwr ? copy_len : 0;
+
+ error = xlog_write_copy_finish(log, iclog, flags,
+ &record_cnt, &data_cnt,
+ &partial_copy,
+ &partial_copy_len,
+ log_offset,
+ commit_iclog);
+ if (error)
+ return error;
+
+ /*
+ * if we had a partial copy, we need to get more iclog
+ * space but we don't want to increment the region
+ * index because there is still more is this region to
+ * write.
+ *
+ * If we completed writing this region, and we flushed
+ * the iclog (indicated by resetting of the record
+ * count), then we also need to get more log space. If
+ * this was the last record, though, we are done and
+ * can just return.
+ */
+ if (partial_copy)
+ break;
+
+ if (++index == lv->lv_niovecs) {
+ lv = lv->lv_next;
+ index = 0;
+ if (lv)
+ vecp = lv->lv_iovecp;
+ }
+ if (record_cnt == 0) {
+ if (!lv)
+ return 0;
+ break;
+ }
+ }
+ }
+
+ ASSERT(len == 0);
+
+ xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+ if (!commit_iclog)
+ return xlog_state_release_iclog(log, iclog);
+
+ ASSERT(flags & XLOG_COMMIT_TRANS);
+ *commit_iclog = iclog;
+ return 0;
+}
+
+
+/*****************************************************************************
+ *
+ * State Machine functions
+ *
+ *****************************************************************************
+ */
+
+/* Clean iclogs starting from the head. This ordering must be
+ * maintained, so an iclog doesn't become ACTIVE beyond one that
+ * is SYNCING. This is also required to maintain the notion that we use
+ * a ordered wait queue to hold off would be writers to the log when every
+ * iclog is trying to sync to disk.
+ *
+ * State Change: DIRTY -> ACTIVE
+ */
+STATIC void
+xlog_state_clean_log(xlog_t *log)
+{
+ xlog_in_core_t *iclog;
+ int changed = 0;
+
+ iclog = log->l_iclog;
+ do {
+ if (iclog->ic_state == XLOG_STATE_DIRTY) {
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_offset = 0;
+ ASSERT(iclog->ic_callback == NULL);
+ /*
+ * If the number of ops in this iclog indicate it just
+ * contains the dummy transaction, we can
+ * change state into IDLE (the second time around).
+ * Otherwise we should change the state into
+ * NEED a dummy.
+ * We don't need to cover the dummy.
+ */
+ if (!changed &&
+ (be32_to_cpu(iclog->ic_header.h_num_logops) ==
+ XLOG_COVER_OPS)) {
+ changed = 1;
+ } else {
+ /*
+ * We have two dirty iclogs so start over
+ * This could also be num of ops indicates
+ * this is not the dummy going out.
+ */
+ changed = 2;
+ }
+ iclog->ic_header.h_num_logops = 0;
+ memset(iclog->ic_header.h_cycle_data, 0,
+ sizeof(iclog->ic_header.h_cycle_data));
+ iclog->ic_header.h_lsn = 0;
+ } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ /* do nothing */;
+ else
+ break; /* stop cleaning */
+ iclog = iclog->ic_next;
+ } while (iclog != log->l_iclog);
+
+ /* log is locked when we are called */
+ /*
+ * Change state for the dummy log recording.
+ * We usually go to NEED. But we go to NEED2 if the changed indicates
+ * we are done writing the dummy record.
+ * If we are done with the second dummy recored (DONE2), then
+ * we go to IDLE.
+ */
+ if (changed) {
+ switch (log->l_covered_state) {
+ case XLOG_STATE_COVER_IDLE:
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ log->l_covered_state = XLOG_STATE_COVER_NEED;
+ break;
+
+ case XLOG_STATE_COVER_DONE:
+ if (changed == 1)
+ log->l_covered_state = XLOG_STATE_COVER_NEED2;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_NEED;
+ break;
+
+ case XLOG_STATE_COVER_DONE2:
+ if (changed == 1)
+ log->l_covered_state = XLOG_STATE_COVER_IDLE;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_NEED;
+ break;
+
+ default:
+ ASSERT(0);
+ }
+ }
+} /* xlog_state_clean_log */
+
+STATIC xfs_lsn_t
+xlog_get_lowest_lsn(
+ xlog_t *log)
+{
+ xlog_in_core_t *lsn_log;
+ xfs_lsn_t lowest_lsn, lsn;
+
+ lsn_log = log->l_iclog;
+ lowest_lsn = 0;
+ do {
+ if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
+ lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
+ if ((lsn && !lowest_lsn) ||
+ (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
+ lowest_lsn = lsn;
+ }
+ }
+ lsn_log = lsn_log->ic_next;
+ } while (lsn_log != log->l_iclog);
+ return lowest_lsn;
+}
+
+
+STATIC void
+xlog_state_do_callback(
+ xlog_t *log,
+ int aborted,
+ xlog_in_core_t *ciclog)
+{
+ xlog_in_core_t *iclog;
+ xlog_in_core_t *first_iclog; /* used to know when we've
+ * processed all iclogs once */
+ xfs_log_callback_t *cb, *cb_next;
+ int flushcnt = 0;
+ xfs_lsn_t lowest_lsn;
+ int ioerrors; /* counter: iclogs with errors */
+ int loopdidcallbacks; /* flag: inner loop did callbacks*/
+ int funcdidcallbacks; /* flag: function did callbacks */
+ int repeats; /* for issuing console warnings if
+ * looping too many times */
+ int wake = 0;
+
+ spin_lock(&log->l_icloglock);
+ first_iclog = iclog = log->l_iclog;
+ ioerrors = 0;
+ funcdidcallbacks = 0;
+ repeats = 0;
+
+ do {
+ /*
+ * Scan all iclogs starting with the one pointed to by the
+ * log. Reset this starting point each time the log is
+ * unlocked (during callbacks).
+ *
+ * Keep looping through iclogs until one full pass is made
+ * without running any callbacks.
+ */
+ first_iclog = log->l_iclog;
+ iclog = log->l_iclog;
+ loopdidcallbacks = 0;
+ repeats++;
+
+ do {
+
+ /* skip all iclogs in the ACTIVE & DIRTY states */
+ if (iclog->ic_state &
+ (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+ iclog = iclog->ic_next;
+ continue;
+ }
+
+ /*
+ * Between marking a filesystem SHUTDOWN and stopping
+ * the log, we do flush all iclogs to disk (if there
+ * wasn't a log I/O error). So, we do want things to
+ * go smoothly in case of just a SHUTDOWN w/o a
+ * LOG_IO_ERROR.
+ */
+ if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+ /*
+ * Can only perform callbacks in order. Since
+ * this iclog is not in the DONE_SYNC/
+ * DO_CALLBACK state, we skip the rest and
+ * just try to clean up. If we set our iclog
+ * to DO_CALLBACK, we will not process it when
+ * we retry since a previous iclog is in the
+ * CALLBACK and the state cannot change since
+ * we are holding the l_icloglock.
+ */
+ if (!(iclog->ic_state &
+ (XLOG_STATE_DONE_SYNC |
+ XLOG_STATE_DO_CALLBACK))) {
+ if (ciclog && (ciclog->ic_state ==
+ XLOG_STATE_DONE_SYNC)) {
+ ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
+ }
+ break;
+ }
+ /*
+ * We now have an iclog that is in either the
+ * DO_CALLBACK or DONE_SYNC states. The other
+ * states (WANT_SYNC, SYNCING, or CALLBACK were
+ * caught by the above if and are going to
+ * clean (i.e. we aren't doing their callbacks)
+ * see the above if.
+ */
+
+ /*
+ * We will do one more check here to see if we
+ * have chased our tail around.
+ */
+
+ lowest_lsn = xlog_get_lowest_lsn(log);
+ if (lowest_lsn &&
+ XFS_LSN_CMP(lowest_lsn,
+ be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
+ iclog = iclog->ic_next;
+ continue; /* Leave this iclog for
+ * another thread */
+ }
+
+ iclog->ic_state = XLOG_STATE_CALLBACK;
+
+
+ /*
+ * update the last_sync_lsn before we drop the
+ * icloglock to ensure we are the only one that
+ * can update it.
+ */
+ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+ be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
+ atomic64_set(&log->l_last_sync_lsn,
+ be64_to_cpu(iclog->ic_header.h_lsn));
+
+ } else
+ ioerrors++;
+
+ spin_unlock(&log->l_icloglock);
+
+ /*
+ * Keep processing entries in the callback list until
+ * we come around and it is empty. We need to
+ * atomically see that the list is empty and change the
+ * state to DIRTY so that we don't miss any more
+ * callbacks being added.
+ */
+ spin_lock(&iclog->ic_callback_lock);
+ cb = iclog->ic_callback;
+ while (cb) {
+ iclog->ic_callback_tail = &(iclog->ic_callback);
+ iclog->ic_callback = NULL;
+ spin_unlock(&iclog->ic_callback_lock);
+
+ /* perform callbacks in the order given */
+ for (; cb; cb = cb_next) {
+ cb_next = cb->cb_next;
+ cb->cb_func(cb->cb_arg, aborted);
+ }
+ spin_lock(&iclog->ic_callback_lock);
+ cb = iclog->ic_callback;
+ }
+
+ loopdidcallbacks++;
+ funcdidcallbacks++;
+
+ spin_lock(&log->l_icloglock);
+ ASSERT(iclog->ic_callback == NULL);
+ spin_unlock(&iclog->ic_callback_lock);
+ if (!(iclog->ic_state & XLOG_STATE_IOERROR))
+ iclog->ic_state = XLOG_STATE_DIRTY;
+
+ /*
+ * Transition from DIRTY to ACTIVE if applicable.
+ * NOP if STATE_IOERROR.
+ */
+ xlog_state_clean_log(log);
+
+ /* wake up threads waiting in xfs_log_force() */
+ wake_up_all(&iclog->ic_force_wait);
+
+ iclog = iclog->ic_next;
+ } while (first_iclog != iclog);
+
+ if (repeats > 5000) {
+ flushcnt += repeats;
+ repeats = 0;
+ xfs_warn(log->l_mp,
+ "%s: possible infinite loop (%d iterations)",
+ __func__, flushcnt);
+ }
+ } while (!ioerrors && loopdidcallbacks);
+
+ /*
+ * make one last gasp attempt to see if iclogs are being left in
+ * limbo..
+ */
+#ifdef DEBUG
+ if (funcdidcallbacks) {
+ first_iclog = iclog = log->l_iclog;
+ do {
+ ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+ /*
+ * Terminate the loop if iclogs are found in states
+ * which will cause other threads to clean up iclogs.
+ *
+ * SYNCING - i/o completion will go through logs
+ * DONE_SYNC - interrupt thread should be waiting for
+ * l_icloglock
+ * IOERROR - give up hope all ye who enter here
+ */
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_SYNCING ||
+ iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR )
+ break;
+ iclog = iclog->ic_next;
+ } while (first_iclog != iclog);
+ }
+#endif
+
+ if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+ wake = 1;
+ spin_unlock(&log->l_icloglock);
+
+ if (wake)
+ wake_up_all(&log->l_flush_wait);
+}
+
+
+/*
+ * Finish transitioning this iclog to the dirty state.
+ *
+ * Make sure that we completely execute this routine only when this is
+ * the last call to the iclog. There is a good chance that iclog flushes,
+ * when we reach the end of the physical log, get turned into 2 separate
+ * calls to bwrite. Hence, one iclog flush could generate two calls to this
+ * routine. By using the reference count bwritecnt, we guarantee that only
+ * the second completion goes through.
+ *
+ * Callbacks could take time, so they are done outside the scope of the
+ * global state machine log lock.
+ */
+STATIC void
+xlog_state_done_syncing(
+ xlog_in_core_t *iclog,
+ int aborted)
+{
+ xlog_t *log = iclog->ic_log;
+
+ spin_lock(&log->l_icloglock);
+
+ ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
+ ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
+ ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
+
+
+ /*
+ * If we got an error, either on the first buffer, or in the case of
+ * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
+ * and none should ever be attempted to be written to disk
+ * again.
+ */
+ if (iclog->ic_state != XLOG_STATE_IOERROR) {
+ if (--iclog->ic_bwritecnt == 1) {
+ spin_unlock(&log->l_icloglock);
+ return;
+ }
+ iclog->ic_state = XLOG_STATE_DONE_SYNC;
+ }
+
+ /*
+ * Someone could be sleeping prior to writing out the next
+ * iclog buffer, we wake them all, one will get to do the
+ * I/O, the others get to wait for the result.
+ */
+ wake_up_all(&iclog->ic_write_wait);
+ spin_unlock(&log->l_icloglock);
+ xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
+} /* xlog_state_done_syncing */
+
+
+/*
+ * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
+ * sleep. We wait on the flush queue on the head iclog as that should be
+ * the first iclog to complete flushing. Hence if all iclogs are syncing,
+ * we will wait here and all new writes will sleep until a sync completes.
+ *
+ * The in-core logs are used in a circular fashion. They are not used
+ * out-of-order even when an iclog past the head is free.
+ *
+ * return:
+ * * log_offset where xlog_write() can start writing into the in-core
+ * log's data space.
+ * * in-core log pointer to which xlog_write() should write.
+ * * boolean indicating this is a continued write to an in-core log.
+ * If this is the last write, then the in-core log's offset field
+ * needs to be incremented, depending on the amount of data which
+ * is copied.
+ */
+STATIC int
+xlog_state_get_iclog_space(xlog_t *log,
+ int len,
+ xlog_in_core_t **iclogp,
+ xlog_ticket_t *ticket,
+ int *continued_write,
+ int *logoffsetp)
+{
+ int log_offset;
+ xlog_rec_header_t *head;
+ xlog_in_core_t *iclog;
+ int error;
+
+restart:
+ spin_lock(&log->l_icloglock);
+ if (XLOG_FORCED_SHUTDOWN(log)) {
+ spin_unlock(&log->l_icloglock);
+ return XFS_ERROR(EIO);
+ }
+
+ iclog = log->l_iclog;
+ if (iclog->ic_state != XLOG_STATE_ACTIVE) {
+ XFS_STATS_INC(xs_log_noiclogs);
+
+ /* Wait for log writes to have flushed */
+ xlog_wait(&log->l_flush_wait, &log->l_icloglock);
+ goto restart;
+ }
+
+ head = &iclog->ic_header;
+
+ atomic_inc(&iclog->ic_refcnt); /* prevents sync */
+ log_offset = iclog->ic_offset;
+
+ /* On the 1st write to an iclog, figure out lsn. This works
+ * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
+ * committing to. If the offset is set, that's how many blocks
+ * must be written.
+ */
+ if (log_offset == 0) {
+ ticket->t_curr_res -= log->l_iclog_hsize;
+ xlog_tic_add_region(ticket,
+ log->l_iclog_hsize,
+ XLOG_REG_TYPE_LRHEADER);
+ head->h_cycle = cpu_to_be32(log->l_curr_cycle);
+ head->h_lsn = cpu_to_be64(
+ xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
+ ASSERT(log->l_curr_block >= 0);
+ }
+
+ /* If there is enough room to write everything, then do it. Otherwise,
+ * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
+ * bit is on, so this will get flushed out. Don't update ic_offset
+ * until you know exactly how many bytes get copied. Therefore, wait
+ * until later to update ic_offset.
+ *
+ * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+ * can fit into remaining data section.
+ */
+ if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+
+ /*
+ * If I'm the only one writing to this iclog, sync it to disk.
+ * We need to do an atomic compare and decrement here to avoid
+ * racing with concurrent atomic_dec_and_lock() calls in
+ * xlog_state_release_iclog() when there is more than one
+ * reference to the iclog.
+ */
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
+ /* we are the only one */
+ spin_unlock(&log->l_icloglock);
+ error = xlog_state_release_iclog(log, iclog);
+ if (error)
+ return error;
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+ goto restart;
+ }
+
+ /* Do we have enough room to write the full amount in the remainder
+ * of this iclog? Or must we continue a write on the next iclog and
+ * mark this iclog as completely taken? In the case where we switch
+ * iclogs (to mark it taken), this particular iclog will release/sync
+ * to disk in xlog_write().
+ */
+ if (len <= iclog->ic_size - iclog->ic_offset) {
+ *continued_write = 0;
+ iclog->ic_offset += len;
+ } else {
+ *continued_write = 1;
+ xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+ }
+ *iclogp = iclog;
+
+ ASSERT(iclog->ic_offset <= iclog->ic_size);
+ spin_unlock(&log->l_icloglock);
+
+ *logoffsetp = log_offset;
+ return 0;
+} /* xlog_state_get_iclog_space */
+
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto the reserveq, it will only return after
+ * the needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off the reserveq under the
+ * l_grant_reserve_lock, we only need to take that lock if we are going
+ * to add the ticket to the queue and sleep. We can avoid taking the lock if the
+ * ticket was never added to the reserveq because the t_queue list head will be
+ * empty and we hold the only reference to it so it can safely be checked
+ * unlocked.
+ */
+STATIC int
+xlog_grant_log_space(xlog_t *log,
+ xlog_ticket_t *tic)
+{
+ int free_bytes;
+ int need_bytes;
+
+#ifdef DEBUG
+ if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+ panic("grant Recovery problem");
+#endif
+
+ trace_xfs_log_grant_enter(log, tic);
+
+ need_bytes = tic->t_unit_res;
+ if (tic->t_flags & XFS_LOG_PERM_RESERV)
+ need_bytes *= tic->t_ocnt;
+
+ /* something is already sleeping; insert new transaction at end */
+ if (!list_empty_careful(&log->l_reserveq)) {
+ spin_lock(&log->l_grant_reserve_lock);
+ /* recheck the queue now we are locked */
+ if (list_empty(&log->l_reserveq)) {
+ spin_unlock(&log->l_grant_reserve_lock);
+ goto redo;
+ }
+ list_add_tail(&tic->t_queue, &log->l_reserveq);
+
+ trace_xfs_log_grant_sleep1(log, tic);
+
+ /*
+ * Gotta check this before going to sleep, while we're
+ * holding the grant lock.
+ */
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto error_return;
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+
+ /*
+ * If we got an error, and the filesystem is shutting down,
+ * we'll catch it down below. So just continue...
+ */
+ trace_xfs_log_grant_wake1(log, tic);
+ }
+
+redo:
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto error_return_unlocked;
+
+ free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+ if (free_bytes < need_bytes) {
+ spin_lock(&log->l_grant_reserve_lock);
+ if (list_empty(&tic->t_queue))
+ list_add_tail(&tic->t_queue, &log->l_reserveq);
+
+ trace_xfs_log_grant_sleep2(log, tic);
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto error_return;
+
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+
+ trace_xfs_log_grant_wake2(log, tic);
+ goto redo;
+ }
+
+ if (!list_empty(&tic->t_queue)) {
+ spin_lock(&log->l_grant_reserve_lock);
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_reserve_lock);
+ }
+
+ /* we've got enough space */
+ xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
+ xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
+ trace_xfs_log_grant_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+error_return_unlocked:
+ spin_lock(&log->l_grant_reserve_lock);
+error_return:
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_reserve_lock);
+ trace_xfs_log_grant_error(log, tic);
+
+ /*
+ * If we are failing, make sure the ticket doesn't have any
+ * current reservations. We don't want to add this back when
+ * the ticket/transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return XFS_ERROR(EIO);
+} /* xlog_grant_log_space */
+
+
+/*
+ * Replenish the byte reservation required by moving the grant write head.
+ *
+ * Similar to xlog_grant_log_space, the function is structured to have a lock
+ * free fast path.
+ */
+STATIC int
+xlog_regrant_write_log_space(xlog_t *log,
+ xlog_ticket_t *tic)
+{
+ int free_bytes, need_bytes;
+
+ tic->t_curr_res = tic->t_unit_res;
+ xlog_tic_reset_res(tic);
+
+ if (tic->t_cnt > 0)
+ return 0;
+
+#ifdef DEBUG
+ if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+ panic("regrant Recovery problem");
+#endif
+
+ trace_xfs_log_regrant_write_enter(log, tic);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto error_return_unlocked;
+
+ /* If there are other waiters on the queue then give them a
+ * chance at logspace before us. Wake up the first waiters,
+ * if we do not wake up all the waiters then go to sleep waiting
+ * for more free space, otherwise try to get some space for
+ * this transaction.
+ */
+ need_bytes = tic->t_unit_res;
+ if (!list_empty_careful(&log->l_writeq)) {
+ struct xlog_ticket *ntic;
+
+ spin_lock(&log->l_grant_write_lock);
+ free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+ list_for_each_entry(ntic, &log->l_writeq, t_queue) {
+ ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
+
+ if (free_bytes < ntic->t_unit_res)
+ break;
+ free_bytes -= ntic->t_unit_res;
+ wake_up(&ntic->t_wait);
+ }
+
+ if (ntic != list_first_entry(&log->l_writeq,
+ struct xlog_ticket, t_queue)) {
+ if (list_empty(&tic->t_queue))
+ list_add_tail(&tic->t_queue, &log->l_writeq);
+ trace_xfs_log_regrant_write_sleep1(log, tic);
+
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+ trace_xfs_log_regrant_write_wake1(log, tic);
+ } else
+ spin_unlock(&log->l_grant_write_lock);
+ }
+
+redo:
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto error_return_unlocked;
+
+ free_bytes = xlog_space_left(log, &log->l_grant_write_head);
+ if (free_bytes < need_bytes) {
+ spin_lock(&log->l_grant_write_lock);
+ if (list_empty(&tic->t_queue))
+ list_add_tail(&tic->t_queue, &log->l_writeq);
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto error_return;
+
+ xlog_grant_push_ail(log, need_bytes);
+
+ XFS_STATS_INC(xs_sleep_logspace);
+ trace_xfs_log_regrant_write_sleep2(log, tic);
+ xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+
+ trace_xfs_log_regrant_write_wake2(log, tic);
+ goto redo;
+ }
+
+ if (!list_empty(&tic->t_queue)) {
+ spin_lock(&log->l_grant_write_lock);
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_write_lock);
+ }
+
+ /* we've got enough space */
+ xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
+ trace_xfs_log_regrant_write_exit(log, tic);
+ xlog_verify_grant_tail(log);
+ return 0;
+
+
+ error_return_unlocked:
+ spin_lock(&log->l_grant_write_lock);
+ error_return:
+ list_del_init(&tic->t_queue);
+ spin_unlock(&log->l_grant_write_lock);
+ trace_xfs_log_regrant_write_error(log, tic);
+
+ /*
+ * If we are failing, make sure the ticket doesn't have any
+ * current reservations. We don't want to add this back when
+ * the ticket/transaction gets cancelled.
+ */
+ tic->t_curr_res = 0;
+ tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+ return XFS_ERROR(EIO);
+} /* xlog_regrant_write_log_space */
+
+
+/* The first cnt-1 times through here we don't need to
+ * move the grant write head because the permanent
+ * reservation has reserved cnt times the unit amount.
+ * Release part of current permanent unit reservation and
+ * reset current reservation to be one units worth. Also
+ * move grant reservation head forward.
+ */
+STATIC void
+xlog_regrant_reserve_log_space(xlog_t *log,
+ xlog_ticket_t *ticket)
+{
+ trace_xfs_log_regrant_reserve_enter(log, ticket);
+
+ if (ticket->t_cnt > 0)
+ ticket->t_cnt--;
+
+ xlog_grant_sub_space(log, &log->l_grant_reserve_head,
+ ticket->t_curr_res);
+ xlog_grant_sub_space(log, &log->l_grant_write_head,
+ ticket->t_curr_res);
+ ticket->t_curr_res = ticket->t_unit_res;
+ xlog_tic_reset_res(ticket);
+
+ trace_xfs_log_regrant_reserve_sub(log, ticket);
+
+ /* just return if we still have some of the pre-reserved space */
+ if (ticket->t_cnt > 0)
+ return;
+
+ xlog_grant_add_space(log, &log->l_grant_reserve_head,
+ ticket->t_unit_res);
+
+ trace_xfs_log_regrant_reserve_exit(log, ticket);
+
+ ticket->t_curr_res = ticket->t_unit_res;
+ xlog_tic_reset_res(ticket);
+} /* xlog_regrant_reserve_log_space */
+
+
+/*
+ * Give back the space left from a reservation.
+ *
+ * All the information we need to make a correct determination of space left
+ * is present. For non-permanent reservations, things are quite easy. The
+ * count should have been decremented to zero. We only need to deal with the
+ * space remaining in the current reservation part of the ticket. If the
+ * ticket contains a permanent reservation, there may be left over space which
+ * needs to be released. A count of N means that N-1 refills of the current
+ * reservation can be done before we need to ask for more space. The first
+ * one goes to fill up the first current reservation. Once we run out of
+ * space, the count will stay at zero and the only space remaining will be
+ * in the current reservation field.
+ */
+STATIC void
+xlog_ungrant_log_space(xlog_t *log,
+ xlog_ticket_t *ticket)
+{
+ int bytes;
+
+ if (ticket->t_cnt > 0)
+ ticket->t_cnt--;
+
+ trace_xfs_log_ungrant_enter(log, ticket);
+ trace_xfs_log_ungrant_sub(log, ticket);
+
+ /*
+ * If this is a permanent reservation ticket, we may be able to free
+ * up more space based on the remaining count.
+ */
+ bytes = ticket->t_curr_res;
+ if (ticket->t_cnt > 0) {
+ ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
+ bytes += ticket->t_unit_res*ticket->t_cnt;
+ }
+
+ xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
+ xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
+
+ trace_xfs_log_ungrant_exit(log, ticket);
+
+ xfs_log_move_tail(log->l_mp, 1);
+} /* xlog_ungrant_log_space */
+
+
+/*
+ * Flush iclog to disk if this is the last reference to the given iclog and
+ * the WANT_SYNC bit is set.
+ *
+ * When this function is entered, the iclog is not necessarily in the
+ * WANT_SYNC state. It may be sitting around waiting to get filled.
+ *
+ *
+ */
+STATIC int
+xlog_state_release_iclog(
+ xlog_t *log,
+ xlog_in_core_t *iclog)
+{
+ int sync = 0; /* do we sync? */
+
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return XFS_ERROR(EIO);
+
+ ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
+ if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
+ return 0;
+
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ spin_unlock(&log->l_icloglock);
+ return XFS_ERROR(EIO);
+ }
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
+
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ /* update tail before writing to iclog */
+ xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+ sync++;
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog, tail_lsn);
+ /* cycle incremented when incrementing curr_block */
+ }
+ spin_unlock(&log->l_icloglock);
+
+ /*
+ * We let the log lock go, so it's possible that we hit a log I/O
+ * error or some other SHUTDOWN condition that marks the iclog
+ * as XLOG_STATE_IOERROR before the bwrite. However, we know that
+ * this iclog has consistent data, so we ignore IOERROR
+ * flags after this point.
+ */
+ if (sync)
+ return xlog_sync(log, iclog);
+ return 0;
+} /* xlog_state_release_iclog */
+
+
+/*
+ * This routine will mark the current iclog in the ring as WANT_SYNC
+ * and move the current iclog pointer to the next iclog in the ring.
+ * When this routine is called from xlog_state_get_iclog_space(), the
+ * exact size of the iclog has not yet been determined. All we know is
+ * that every data block. We have run out of space in this log record.
+ */
+STATIC void
+xlog_state_switch_iclogs(xlog_t *log,
+ xlog_in_core_t *iclog,
+ int eventual_size)
+{
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ if (!eventual_size)
+ eventual_size = iclog->ic_offset;
+ iclog->ic_state = XLOG_STATE_WANT_SYNC;
+ iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
+ log->l_prev_block = log->l_curr_block;
+ log->l_prev_cycle = log->l_curr_cycle;
+
+ /* roll log?: ic_offset changed later */
+ log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
+
+ /* Round up to next log-sunit */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
+ log->l_mp->m_sb.sb_logsunit > 1) {
+ __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
+ log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
+ }
+
+ if (log->l_curr_block >= log->l_logBBsize) {
+ log->l_curr_cycle++;
+ if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
+ log->l_curr_cycle++;
+ log->l_curr_block -= log->l_logBBsize;
+ ASSERT(log->l_curr_block >= 0);
+ }
+ ASSERT(iclog == log->l_iclog);
+ log->l_iclog = iclog->ic_next;
+} /* xlog_state_switch_iclogs */
+
+/*
+ * Write out all data in the in-core log as of this exact moment in time.
+ *
+ * Data may be written to the in-core log during this call. However,
+ * we don't guarantee this data will be written out. A change from past
+ * implementation means this routine will *not* write out zero length LRs.
+ *
+ * Basically, we try and perform an intelligent scan of the in-core logs.
+ * If we determine there is no flushable data, we just return. There is no
+ * flushable data if:
+ *
+ * 1. the current iclog is active and has no data; the previous iclog
+ * is in the active or dirty state.
+ * 2. the current iclog is drity, and the previous iclog is in the
+ * active or dirty state.
+ *
+ * We may sleep if:
+ *
+ * 1. the current iclog is not in the active nor dirty state.
+ * 2. the current iclog dirty, and the previous iclog is not in the
+ * active nor dirty state.
+ * 3. the current iclog is active, and there is another thread writing
+ * to this particular iclog.
+ * 4. a) the current iclog is active and has no other writers
+ * b) when we return from flushing out this iclog, it is still
+ * not in the active nor dirty state.
+ */
+int
+_xfs_log_force(
+ struct xfs_mount *mp,
+ uint flags,
+ int *log_flushed)
+{
+ struct log *log = mp->m_log;
+ struct xlog_in_core *iclog;
+ xfs_lsn_t lsn;
+
+ XFS_STATS_INC(xs_log_force);
+
+ if (log->l_cilp)
+ xlog_cil_force(log);
+
+ spin_lock(&log->l_icloglock);
+
+ iclog = log->l_iclog;
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ spin_unlock(&log->l_icloglock);
+ return XFS_ERROR(EIO);
+ }
+
+ /* If the head iclog is not active nor dirty, we just attach
+ * ourselves to the head and go to sleep.
+ */
+ if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY) {
+ /*
+ * If the head is dirty or (active and empty), then
+ * we need to look at the previous iclog. If the previous
+ * iclog is active or dirty we are done. There is nothing
+ * to sync out. Otherwise, we attach ourselves to the
+ * previous iclog and go to sleep.
+ */
+ if (iclog->ic_state == XLOG_STATE_DIRTY ||
+ (atomic_read(&iclog->ic_refcnt) == 0
+ && iclog->ic_offset == 0)) {
+ iclog = iclog->ic_prev;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY)
+ goto no_sleep;
+ else
+ goto maybe_sleep;
+ } else {
+ if (atomic_read(&iclog->ic_refcnt) == 0) {
+ /* We are the only one with access to this
+ * iclog. Flush it out now. There should
+ * be a roundoff of zero to show that someone
+ * has already taken care of the roundoff from
+ * the previous sync.
+ */
+ atomic_inc(&iclog->ic_refcnt);
+ lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ xlog_state_switch_iclogs(log, iclog, 0);
+ spin_unlock(&log->l_icloglock);
+
+ if (xlog_state_release_iclog(log, iclog))
+ return XFS_ERROR(EIO);
+
+ if (log_flushed)
+ *log_flushed = 1;
+ spin_lock(&log->l_icloglock);
+ if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
+ iclog->ic_state != XLOG_STATE_DIRTY)
+ goto maybe_sleep;
+ else
+ goto no_sleep;
+ } else {
+ /* Someone else is writing to this iclog.
+ * Use its call to flush out the data. However,
+ * the other thread may not force out this LR,
+ * so we mark it WANT_SYNC.
+ */
+ xlog_state_switch_iclogs(log, iclog, 0);
+ goto maybe_sleep;
+ }
+ }
+ }
+
+ /* By the time we come around again, the iclog could've been filled
+ * which would give it another lsn. If we have a new lsn, just
+ * return because the relevant data has been flushed.
+ */
+maybe_sleep:
+ if (flags & XFS_LOG_SYNC) {
+ /*
+ * We must check if we're shutting down here, before
+ * we wait, while we're holding the l_icloglock.
+ * Then we check again after waking up, in case our
+ * sleep was disturbed by a bad news.
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ spin_unlock(&log->l_icloglock);
+ return XFS_ERROR(EIO);
+ }
+ XFS_STATS_INC(xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ /*
+ * No need to grab the log lock here since we're
+ * only deciding whether or not to return EIO
+ * and the memory read should be atomic.
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return XFS_ERROR(EIO);
+ if (log_flushed)
+ *log_flushed = 1;
+ } else {
+
+no_sleep:
+ spin_unlock(&log->l_icloglock);
+ }
+ return 0;
+}
+
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+ xfs_mount_t *mp,
+ uint flags)
+{
+ int error;
+
+ error = _xfs_log_force(mp, flags, NULL);
+ if (error)
+ xfs_warn(mp, "%s: error %d returned.", __func__, error);
+}
+
+/*
+ * Force the in-core log to disk for a specific LSN.
+ *
+ * Find in-core log with lsn.
+ * If it is in the DIRTY state, just return.
+ * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ * state and go to sleep or return.
+ * If it is in any other state, go to sleep or return.
+ *
+ * Synchronous forces are implemented with a signal variable. All callers
+ * to force a given lsn to disk will wait on a the sv attached to the
+ * specific in-core log. When given in-core log finally completes its
+ * write to disk, that thread will wake up all threads waiting on the
+ * sv.
+ */
+int
+_xfs_log_force_lsn(
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn,
+ uint flags,
+ int *log_flushed)
+{
+ struct log *log = mp->m_log;
+ struct xlog_in_core *iclog;
+ int already_slept = 0;
+
+ ASSERT(lsn != 0);
+
+ XFS_STATS_INC(xs_log_force);
+
+ if (log->l_cilp) {
+ lsn = xlog_cil_force_lsn(log, lsn);
+ if (lsn == NULLCOMMITLSN)
+ return 0;
+ }
+
+try_again:
+ spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ spin_unlock(&log->l_icloglock);
+ return XFS_ERROR(EIO);
+ }
+
+ do {
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+ iclog = iclog->ic_next;
+ continue;
+ }
+
+ if (iclog->ic_state == XLOG_STATE_DIRTY) {
+ spin_unlock(&log->l_icloglock);
+ return 0;
+ }
+
+ if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ /*
+ * We sleep here if we haven't already slept (e.g.
+ * this is the first time we've looked at the correct
+ * iclog buf) and the buffer before us is going to
+ * be sync'ed. The reason for this is that if we
+ * are doing sync transactions here, by waiting for
+ * the previous I/O to complete, we can allow a few
+ * more transactions into this iclog before we close
+ * it down.
+ *
+ * Otherwise, we mark the buffer WANT_SYNC, and bump
+ * up the refcnt so we can release the log (which
+ * drops the ref count). The state switch keeps new
+ * transaction commits from using this buffer. When
+ * the current commits finish writing into the buffer,
+ * the refcount will drop to zero and the buffer will
+ * go out then.
+ */
+ if (!already_slept &&
+ (iclog->ic_prev->ic_state &
+ (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
+ ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
+
+ XFS_STATS_INC(xs_log_force_sleep);
+
+ xlog_wait(&iclog->ic_prev->ic_write_wait,
+ &log->l_icloglock);
+ if (log_flushed)
+ *log_flushed = 1;
+ already_slept = 1;
+ goto try_again;
+ }
+ atomic_inc(&iclog->ic_refcnt);
+ xlog_state_switch_iclogs(log, iclog, 0);
+ spin_unlock(&log->l_icloglock);
+ if (xlog_state_release_iclog(log, iclog))
+ return XFS_ERROR(EIO);
+ if (log_flushed)
+ *log_flushed = 1;
+ spin_lock(&log->l_icloglock);
+ }
+
+ if ((flags & XFS_LOG_SYNC) && /* sleep */
+ !(iclog->ic_state &
+ (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+ /*
+ * Don't wait on completion if we know that we've
+ * gotten a log write error.
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ spin_unlock(&log->l_icloglock);
+ return XFS_ERROR(EIO);
+ }
+ XFS_STATS_INC(xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ /*
+ * No need to grab the log lock here since we're
+ * only deciding whether or not to return EIO
+ * and the memory read should be atomic.
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR)
+ return XFS_ERROR(EIO);
+
+ if (log_flushed)
+ *log_flushed = 1;
+ } else { /* just return */
+ spin_unlock(&log->l_icloglock);
+ }
+
+ return 0;
+ } while (iclog != log->l_iclog);
+
+ spin_unlock(&log->l_icloglock);
+ return 0;
+}
+
+/*
+ * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force_lsn(
+ xfs_mount_t *mp,
+ xfs_lsn_t lsn,
+ uint flags)
+{
+ int error;
+
+ error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
+ if (error)
+ xfs_warn(mp, "%s: error %d returned.", __func__, error);
+}
+
+/*
+ * Called when we want to mark the current iclog as being ready to sync to
+ * disk.
+ */
+STATIC void
+xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
+{
+ assert_spin_locked(&log->l_icloglock);
+
+ if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+ xlog_state_switch_iclogs(log, iclog, 0);
+ } else {
+ ASSERT(iclog->ic_state &
+ (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
+ }
+}
+
+
+/*****************************************************************************
+ *
+ * TICKET functions
+ *
+ *****************************************************************************
+ */
+
+/*
+ * Free a used ticket when its refcount falls to zero.
+ */
+void
+xfs_log_ticket_put(
+ xlog_ticket_t *ticket)
+{
+ ASSERT(atomic_read(&ticket->t_ref) > 0);
+ if (atomic_dec_and_test(&ticket->t_ref))
+ kmem_zone_free(xfs_log_ticket_zone, ticket);
+}
+
+xlog_ticket_t *
+xfs_log_ticket_get(
+ xlog_ticket_t *ticket)
+{
+ ASSERT(atomic_read(&ticket->t_ref) > 0);
+ atomic_inc(&ticket->t_ref);
+ return ticket;
+}
+
+/*
+ * Allocate and initialise a new log ticket.
+ */
+xlog_ticket_t *
+xlog_ticket_alloc(
+ struct log *log,
+ int unit_bytes,
+ int cnt,
+ char client,
+ uint xflags,
+ int alloc_flags)
+{
+ struct xlog_ticket *tic;
+ uint num_headers;
+ int iclog_space;
+
+ tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
+ if (!tic)
+ return NULL;
+
+ /*
+ * Permanent reservations have up to 'cnt'-1 active log operations
+ * in the log. A unit in this case is the amount of space for one
+ * of these log operations. Normal reservations have a cnt of 1
+ * and their unit amount is the total amount of space required.
+ *
+ * The following lines of code account for non-transaction data
+ * which occupy space in the on-disk log.
+ *
+ * Normal form of a transaction is:
+ * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
+ * and then there are LR hdrs, split-recs and roundoff at end of syncs.
+ *
+ * We need to account for all the leadup data and trailer data
+ * around the transaction data.
+ * And then we need to account for the worst case in terms of using
+ * more space.
+ * The worst case will happen if:
+ * - the placement of the transaction happens to be such that the
+ * roundoff is at its maximum
+ * - the transaction data is synced before the commit record is synced
+ * i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
+ * Therefore the commit record is in its own Log Record.
+ * This can happen as the commit record is called with its
+ * own region to xlog_write().
+ * This then means that in the worst case, roundoff can happen for
+ * the commit-rec as well.
+ * The commit-rec is smaller than padding in this scenario and so it is
+ * not added separately.
+ */
+
+ /* for trans header */
+ unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(xfs_trans_header_t);
+
+ /* for start-rec */
+ unit_bytes += sizeof(xlog_op_header_t);
+
+ /*
+ * for LR headers - the space for data in an iclog is the size minus
+ * the space used for the headers. If we use the iclog size, then we
+ * undercalculate the number of headers required.
+ *
+ * Furthermore - the addition of op headers for split-recs might
+ * increase the space required enough to require more log and op
+ * headers, so take that into account too.
+ *
+ * IMPORTANT: This reservation makes the assumption that if this
+ * transaction is the first in an iclog and hence has the LR headers
+ * accounted to it, then the remaining space in the iclog is
+ * exclusively for this transaction. i.e. if the transaction is larger
+ * than the iclog, it will be the only thing in that iclog.
+ * Fundamentally, this means we must pass the entire log vector to
+ * xlog_write to guarantee this.
+ */
+ iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+ num_headers = howmany(unit_bytes, iclog_space);
+
+ /* for split-recs - ophdrs added when data split over LRs */
+ unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+
+ /* add extra header reservations if we overrun */
+ while (!num_headers ||
+ howmany(unit_bytes, iclog_space) > num_headers) {
+ unit_bytes += sizeof(xlog_op_header_t);
+ num_headers++;
+ }
+ unit_bytes += log->l_iclog_hsize * num_headers;
+
+ /* for commit-rec LR header - note: padding will subsume the ophdr */
+ unit_bytes += log->l_iclog_hsize;
+
+ /* for roundoff padding for transaction data and one for commit record */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
+ log->l_mp->m_sb.sb_logsunit > 1) {
+ /* log su roundoff */
+ unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
+ } else {
+ /* BB roundoff */
+ unit_bytes += 2*BBSIZE;
+ }
+
+ atomic_set(&tic->t_ref, 1);
+ INIT_LIST_HEAD(&tic->t_queue);
+ tic->t_unit_res = unit_bytes;
+ tic->t_curr_res = unit_bytes;
+ tic->t_cnt = cnt;
+ tic->t_ocnt = cnt;
+ tic->t_tid = random32();
+ tic->t_clientid = client;
+ tic->t_flags = XLOG_TIC_INITED;
+ tic->t_trans_type = 0;
+ if (xflags & XFS_LOG_PERM_RESERV)
+ tic->t_flags |= XLOG_TIC_PERM_RESERV;
+ init_waitqueue_head(&tic->t_wait);
+
+ xlog_tic_reset_res(tic);
+
+ return tic;
+}
+
+
+/******************************************************************************
+ *
+ * Log debug routines
+ *
+ ******************************************************************************
+ */
+#if defined(DEBUG)
+/*
+ * Make sure that the destination ptr is within the valid data region of
+ * one of the iclogs. This uses backup pointers stored in a different
+ * part of the log in case we trash the log structure.
+ */
+void
+xlog_verify_dest_ptr(
+ struct log *log,
+ char *ptr)
+{
+ int i;
+ int good_ptr = 0;
+
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ if (ptr >= log->l_iclog_bak[i] &&
+ ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
+ good_ptr++;
+ }
+
+ if (!good_ptr)
+ xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
+}
+
+/*
+ * Check to make sure the grant write head didn't just over lap the tail. If
+ * the cycles are the same, we can't be overlapping. Otherwise, make sure that
+ * the cycles differ by exactly one and check the byte count.
+ *
+ * This check is run unlocked, so can give false positives. Rather than assert
+ * on failures, use a warn-once flag and a panic tag to allow the admin to
+ * determine if they want to panic the machine when such an error occurs. For
+ * debug kernels this will have the same effect as using an assert but, unlinke
+ * an assert, it can be turned off at runtime.
+ */
+STATIC void
+xlog_verify_grant_tail(
+ struct log *log)
+{
+ int tail_cycle, tail_blocks;
+ int cycle, space;
+
+ xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
+ if (tail_cycle != cycle) {
+ if (cycle - 1 != tail_cycle &&
+ !(log->l_flags & XLOG_TAIL_WARN)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+ "%s: cycle - 1 != tail_cycle", __func__);
+ log->l_flags |= XLOG_TAIL_WARN;
+ }
+
+ if (space > BBTOB(tail_blocks) &&
+ !(log->l_flags & XLOG_TAIL_WARN)) {
+ xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
+ "%s: space > BBTOB(tail_blocks)", __func__);
+ log->l_flags |= XLOG_TAIL_WARN;
+ }
+ }
+}
+
+/* check if it will fit */
+STATIC void
+xlog_verify_tail_lsn(xlog_t *log,
+ xlog_in_core_t *iclog,
+ xfs_lsn_t tail_lsn)
+{
+ int blocks;
+
+ if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
+ blocks =
+ log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
+ if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
+ xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
+ } else {
+ ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
+
+ if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
+ xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
+
+ blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
+ if (blocks < BTOBB(iclog->ic_offset) + 1)
+ xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
+ }
+} /* xlog_verify_tail_lsn */
+
+/*
+ * Perform a number of checks on the iclog before writing to disk.
+ *
+ * 1. Make sure the iclogs are still circular
+ * 2. Make sure we have a good magic number
+ * 3. Make sure we don't have magic numbers in the data
+ * 4. Check fields of each log operation header for:
+ * A. Valid client identifier
+ * B. tid ptr value falls in valid ptr space (user space code)
+ * C. Length in log record header is correct according to the
+ * individual operation headers within record.
+ * 5. When a bwrite will occur within 5 blocks of the front of the physical
+ * log, check the preceding blocks of the physical log to make sure all
+ * the cycle numbers agree with the current cycle number.
+ */
+STATIC void
+xlog_verify_iclog(xlog_t *log,
+ xlog_in_core_t *iclog,
+ int count,
+ boolean_t syncing)
+{
+ xlog_op_header_t *ophead;
+ xlog_in_core_t *icptr;
+ xlog_in_core_2_t *xhdr;
+ xfs_caddr_t ptr;
+ xfs_caddr_t base_ptr;
+ __psint_t field_offset;
+ __uint8_t clientid;
+ int len, i, j, k, op_len;
+ int idx;
+
+ /* check validity of iclog pointers */
+ spin_lock(&log->l_icloglock);
+ icptr = log->l_iclog;
+ for (i=0; i < log->l_iclog_bufs; i++) {
+ if (icptr == NULL)
+ xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
+ icptr = icptr->ic_next;
+ }
+ if (icptr != log->l_iclog)
+ xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
+ spin_unlock(&log->l_icloglock);
+
+ /* check log magic numbers */
+ if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
+ xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
+
+ ptr = (xfs_caddr_t) &iclog->ic_header;
+ for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
+ ptr += BBSIZE) {
+ if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+ xfs_emerg(log->l_mp, "%s: unexpected magic num",
+ __func__);
+ }
+
+ /* check fields */
+ len = be32_to_cpu(iclog->ic_header.h_num_logops);
+ ptr = iclog->ic_datap;
+ base_ptr = ptr;
+ ophead = (xlog_op_header_t *)ptr;
+ xhdr = iclog->ic_data;
+ for (i = 0; i < len; i++) {
+ ophead = (xlog_op_header_t *)ptr;
+
+ /* clientid is only 1 byte */
+ field_offset = (__psint_t)
+ ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+ if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ clientid = ophead->oh_clientid;
+ } else {
+ idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+ if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+ j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ clientid = xlog_get_client_id(
+ xhdr[j].hic_xheader.xh_cycle_data[k]);
+ } else {
+ clientid = xlog_get_client_id(
+ iclog->ic_header.h_cycle_data[idx]);
+ }
+ }
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+ xfs_warn(log->l_mp,
+ "%s: invalid clientid %d op 0x%p offset 0x%lx",
+ __func__, clientid, ophead,
+ (unsigned long)field_offset);
+
+ /* check length */
+ field_offset = (__psint_t)
+ ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+ if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+ op_len = be32_to_cpu(ophead->oh_len);
+ } else {
+ idx = BTOBBT((__psint_t)&ophead->oh_len -
+ (__psint_t)iclog->ic_datap);
+ if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+ j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
+ } else {
+ op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
+ }
+ }
+ ptr += sizeof(xlog_op_header_t) + op_len;
+ }
+} /* xlog_verify_iclog */
+#endif
+
+/*
+ * Mark all iclogs IOERROR. l_icloglock is held by the caller.
+ */
+STATIC int
+xlog_state_ioerror(
+ xlog_t *log)
+{
+ xlog_in_core_t *iclog, *ic;
+
+ iclog = log->l_iclog;
+ if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
+ /*
+ * Mark all the incore logs IOERROR.
+ * From now on, no log flushes will result.
+ */
+ ic = iclog;
+ do {
+ ic->ic_state = XLOG_STATE_IOERROR;
+ ic = ic->ic_next;
+ } while (ic != iclog);
+ return 0;
+ }
+ /*
+ * Return non-zero, if state transition has already happened.
+ */
+ return 1;
+}
+
+/*
+ * This is called from xfs_force_shutdown, when we're forcibly
+ * shutting down the filesystem, typically because of an IO error.
+ * Our main objectives here are to make sure that:
+ * a. the filesystem gets marked 'SHUTDOWN' for all interested
+ * parties to find out, 'atomically'.
+ * b. those who're sleeping on log reservations, pinned objects and
+ * other resources get woken up, and be told the bad news.
+ * c. nothing new gets queued up after (a) and (b) are done.
+ * d. if !logerror, flush the iclogs to disk, then seal them off
+ * for business.
+ *
+ * Note: for delayed logging the !logerror case needs to flush the regions
+ * held in memory out to the iclogs before flushing them to disk. This needs
+ * to be done before the log is marked as shutdown, otherwise the flush to the
+ * iclogs will fail.
+ */
+int
+xfs_log_force_umount(
+ struct xfs_mount *mp,
+ int logerror)
+{
+ xlog_ticket_t *tic;
+ xlog_t *log;
+ int retval;
+
+ log = mp->m_log;
+
+ /*
+ * If this happens during log recovery, don't worry about
+ * locking; the log isn't open for business yet.
+ */
+ if (!log ||
+ log->l_flags & XLOG_ACTIVE_RECOVERY) {
+ mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+ if (mp->m_sb_bp)
+ XFS_BUF_DONE(mp->m_sb_bp);
+ return 0;
+ }
+
+ /*
+ * Somebody could've already done the hard work for us.
+ * No need to get locks for this.
+ */
+ if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
+ ASSERT(XLOG_FORCED_SHUTDOWN(log));
+ return 1;
+ }
+ retval = 0;
+
+ /*
+ * Flush the in memory commit item list before marking the log as
+ * being shut down. We need to do it in this order to ensure all the
+ * completed transactions are flushed to disk with the xfs_log_force()
+ * call below.
+ */
+ if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+ xlog_cil_force(log);
+
+ /*
+ * mark the filesystem and the as in a shutdown state and wake
+ * everybody up to tell them the bad news.
+ */
+ spin_lock(&log->l_icloglock);
+ mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+ if (mp->m_sb_bp)
+ XFS_BUF_DONE(mp->m_sb_bp);
+
+ /*
+ * This flag is sort of redundant because of the mount flag, but
+ * it's good to maintain the separation between the log and the rest
+ * of XFS.
+ */
+ log->l_flags |= XLOG_IO_ERROR;
+
+ /*
+ * If we hit a log error, we want to mark all the iclogs IOERROR
+ * while we're still holding the loglock.
+ */
+ if (logerror)
+ retval = xlog_state_ioerror(log);
+ spin_unlock(&log->l_icloglock);
+
+ /*
+ * We don't want anybody waiting for log reservations after this. That
+ * means we have to wake up everybody queued up on reserveq as well as
+ * writeq. In addition, we make sure in xlog_{re}grant_log_space that
+ * we don't enqueue anything once the SHUTDOWN flag is set, and this
+ * action is protected by the grant locks.
+ */
+ spin_lock(&log->l_grant_reserve_lock);
+ list_for_each_entry(tic, &log->l_reserveq, t_queue)
+ wake_up(&tic->t_wait);
+ spin_unlock(&log->l_grant_reserve_lock);
+
+ spin_lock(&log->l_grant_write_lock);
+ list_for_each_entry(tic, &log->l_writeq, t_queue)
+ wake_up(&tic->t_wait);
+ spin_unlock(&log->l_grant_write_lock);
+
+ if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
+ ASSERT(!logerror);
+ /*
+ * Force the incore logs to disk before shutting the
+ * log down completely.
+ */
+ _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+
+ spin_lock(&log->l_icloglock);
+ retval = xlog_state_ioerror(log);
+ spin_unlock(&log->l_icloglock);
+ }
+ /*
+ * Wake up everybody waiting on xfs_log_force.
+ * Callback all log item committed functions as if the
+ * log writes were completed.
+ */
+ xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+
+#ifdef XFSERRORDEBUG
+ {
+ xlog_in_core_t *iclog;
+
+ spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ do {
+ ASSERT(iclog->ic_callback == 0);
+ iclog = iclog->ic_next;
+ } while (iclog != log->l_iclog);
+ spin_unlock(&log->l_icloglock);
+ }
+#endif
+ /* return non-zero if log IOERROR transition had already happened */
+ return retval;
+}
+
+STATIC int
+xlog_iclogs_empty(xlog_t *log)
+{
+ xlog_in_core_t *iclog;
+
+ iclog = log->l_iclog;
+ do {
+ /* endianness does not matter here, zero is zero in
+ * any language.
+ */
+ if (iclog->ic_header.h_num_logops)
+ return 0;
+ iclog = iclog->ic_next;
+ } while (iclog != log->l_iclog);
+ return 1;
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
new file mode 100644
index 00000000..78c90399
--- /dev/null
+++ b/fs/xfs/xfs_log.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_LOG_H__
+#define __XFS_LOG_H__
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+#ifdef __KERNEL__
+/*
+ * By comparing each component, we don't have to worry about extra
+ * endian issues in treating two 32 bit numbers as one 64 bit number
+ */
+static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
+{
+ if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
+ return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
+
+ if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
+ return (BLOCK_LSN(lsn1)<BLOCK_LSN(lsn2))? -999 : 999;
+
+ return 0;
+}
+
+#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
+
+/*
+ * Macros, structures, prototypes for interface to the log manager.
+ */
+
+/*
+ * Flags to xfs_log_done()
+ */
+#define XFS_LOG_REL_PERM_RESERV 0x1
+
+/*
+ * Flags to xfs_log_reserve()
+ *
+ * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
+ * performed against this type of reservation, the reservation
+ * is not decreased. Long running transactions should use this.
+ */
+#define XFS_LOG_PERM_RESERV 0x2
+
+/*
+ * Flags to xfs_log_force()
+ *
+ * XFS_LOG_SYNC: Synchronous force in-core log to disk
+ */
+#define XFS_LOG_SYNC 0x1
+
+#endif /* __KERNEL__ */
+
+
+/* Log Clients */
+#define XFS_TRANSACTION 0x69
+#define XFS_VOLUME 0x2
+#define XFS_LOG 0xaa
+
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT 1
+#define XLOG_REG_TYPE_BCHUNK 2
+#define XLOG_REG_TYPE_EFI_FORMAT 3
+#define XLOG_REG_TYPE_EFD_FORMAT 4
+#define XLOG_REG_TYPE_IFORMAT 5
+#define XLOG_REG_TYPE_ICORE 6
+#define XLOG_REG_TYPE_IEXT 7
+#define XLOG_REG_TYPE_IBROOT 8
+#define XLOG_REG_TYPE_ILOCAL 9
+#define XLOG_REG_TYPE_IATTR_EXT 10
+#define XLOG_REG_TYPE_IATTR_BROOT 11
+#define XLOG_REG_TYPE_IATTR_LOCAL 12
+#define XLOG_REG_TYPE_QFORMAT 13
+#define XLOG_REG_TYPE_DQUOT 14
+#define XLOG_REG_TYPE_QUOTAOFF 15
+#define XLOG_REG_TYPE_LRHEADER 16
+#define XLOG_REG_TYPE_UNMOUNT 17
+#define XLOG_REG_TYPE_COMMIT 18
+#define XLOG_REG_TYPE_TRANSHDR 19
+#define XLOG_REG_TYPE_MAX 19
+
+typedef struct xfs_log_iovec {
+ void *i_addr; /* beginning address of region */
+ int i_len; /* length in bytes of region */
+ uint i_type; /* type of region */
+} xfs_log_iovec_t;
+
+struct xfs_log_vec {
+ struct xfs_log_vec *lv_next; /* next lv in build list */
+ int lv_niovecs; /* number of iovecs in lv */
+ struct xfs_log_iovec *lv_iovecp; /* iovec array */
+ struct xfs_log_item *lv_item; /* owner */
+ char *lv_buf; /* formatted buffer */
+ int lv_buf_len; /* size of formatted buffer */
+};
+
+/*
+ * Structure used to pass callback function and the function's argument
+ * to the log manager.
+ */
+typedef struct xfs_log_callback {
+ struct xfs_log_callback *cb_next;
+ void (*cb_func)(void *, int);
+ void *cb_arg;
+} xfs_log_callback_t;
+
+
+#ifdef __KERNEL__
+/* Log manager interfaces */
+struct xfs_mount;
+struct xlog_in_core;
+struct xlog_ticket;
+struct xfs_log_item;
+struct xfs_item_ops;
+struct xfs_trans;
+
+void xfs_log_item_init(struct xfs_mount *mp,
+ struct xfs_log_item *item,
+ int type,
+ struct xfs_item_ops *ops);
+
+xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
+ struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog,
+ uint flags);
+int _xfs_log_force(struct xfs_mount *mp,
+ uint flags,
+ int *log_forced);
+void xfs_log_force(struct xfs_mount *mp,
+ uint flags);
+int _xfs_log_force_lsn(struct xfs_mount *mp,
+ xfs_lsn_t lsn,
+ uint flags,
+ int *log_forced);
+void xfs_log_force_lsn(struct xfs_mount *mp,
+ xfs_lsn_t lsn,
+ uint flags);
+int xfs_log_mount(struct xfs_mount *mp,
+ struct xfs_buftarg *log_target,
+ xfs_daddr_t start_block,
+ int num_bblocks);
+int xfs_log_mount_finish(struct xfs_mount *mp);
+void xfs_log_move_tail(struct xfs_mount *mp,
+ xfs_lsn_t tail_lsn);
+int xfs_log_notify(struct xfs_mount *mp,
+ struct xlog_in_core *iclog,
+ xfs_log_callback_t *callback_entry);
+int xfs_log_release_iclog(struct xfs_mount *mp,
+ struct xlog_in_core *iclog);
+int xfs_log_reserve(struct xfs_mount *mp,
+ int length,
+ int count,
+ struct xlog_ticket **ticket,
+ __uint8_t clientid,
+ uint flags,
+ uint t_type);
+int xfs_log_write(struct xfs_mount *mp,
+ xfs_log_iovec_t region[],
+ int nentries,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *start_lsn);
+int xfs_log_unmount_write(struct xfs_mount *mp);
+void xfs_log_unmount(struct xfs_mount *mp);
+int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+int xfs_log_need_covered(struct xfs_mount *mp);
+
+void xlog_iodone(struct xfs_buf *);
+
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
+void xfs_log_ticket_put(struct xlog_ticket *ticket);
+
+void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct xfs_log_vec *log_vector,
+ xfs_lsn_t *commit_lsn, int flags);
+bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
+#endif
+#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 00000000..c7755d5a
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,804 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+#include "xfs_discard.h"
+
+/*
+ * Perform initial CIL structure initialisation. If the CIL is not
+ * enabled in this filesystem, ensure the log->l_cilp is null so
+ * we can check this conditional to determine if we are doing delayed
+ * logging or not.
+ */
+int
+xlog_cil_init(
+ struct log *log)
+{
+ struct xfs_cil *cil;
+ struct xfs_cil_ctx *ctx;
+
+ log->l_cilp = NULL;
+ if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
+ return 0;
+
+ cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+ if (!cil)
+ return ENOMEM;
+
+ ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+ if (!ctx) {
+ kmem_free(cil);
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&cil->xc_cil);
+ INIT_LIST_HEAD(&cil->xc_committing);
+ spin_lock_init(&cil->xc_cil_lock);
+ init_rwsem(&cil->xc_ctx_lock);
+ init_waitqueue_head(&cil->xc_commit_wait);
+
+ INIT_LIST_HEAD(&ctx->committing);
+ INIT_LIST_HEAD(&ctx->busy_extents);
+ ctx->sequence = 1;
+ ctx->cil = cil;
+ cil->xc_ctx = ctx;
+ cil->xc_current_sequence = ctx->sequence;
+
+ cil->xc_log = log;
+ log->l_cilp = cil;
+ return 0;
+}
+
+void
+xlog_cil_destroy(
+ struct log *log)
+{
+ if (!log->l_cilp)
+ return;
+
+ if (log->l_cilp->xc_ctx) {
+ if (log->l_cilp->xc_ctx->ticket)
+ xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+ kmem_free(log->l_cilp->xc_ctx);
+ }
+
+ ASSERT(list_empty(&log->l_cilp->xc_cil));
+ kmem_free(log->l_cilp);
+}
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+ struct log *log)
+{
+ struct xlog_ticket *tic;
+
+ tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+ KM_SLEEP|KM_NOFS);
+ tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+
+ /*
+ * set the current reservation to zero so we know to steal the basic
+ * transaction overhead reservation from the first transaction commit.
+ */
+ tic->t_curr_res = 0;
+ return tic;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push. This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+ struct log *log)
+{
+ if (!log->l_cilp)
+ return;
+
+ log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+ log->l_cilp->xc_ctx->sequence = 1;
+ log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
+ log->l_curr_block);
+}
+
+/*
+ * Format log item into a flat buffers
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item are then inserted into the
+ * Committed Item List for tracking until the next checkpoint is written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer. Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundares without needing a change in the format of the
+ * item/region encapsulation.
+ *
+ * Hence what we need to do now is change the rewrite the vector array to point
+ * to the copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ */
+static void
+xlog_cil_format_items(
+ struct log *log,
+ struct xfs_log_vec *log_vector)
+{
+ struct xfs_log_vec *lv;
+
+ ASSERT(log_vector);
+ for (lv = log_vector; lv; lv = lv->lv_next) {
+ void *ptr;
+ int index;
+ int len = 0;
+
+ /* build the vector array and calculate it's length */
+ IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+ for (index = 0; index < lv->lv_niovecs; index++)
+ len += lv->lv_iovecp[index].i_len;
+
+ lv->lv_buf_len = len;
+ lv->lv_buf = kmem_alloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+ ptr = lv->lv_buf;
+
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+ memcpy(ptr, vec->i_addr, vec->i_len);
+ vec->i_addr = ptr;
+ ptr += vec->i_len;
+ }
+ ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+ }
+}
+
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+ struct log *log,
+ struct xfs_log_vec *lv,
+ int *len,
+ int *diff_iovecs)
+{
+ struct xfs_log_vec *old = lv->lv_item->li_lv;
+
+ if (old) {
+ /* existing lv on log item, space used is a delta */
+ ASSERT(!list_empty(&lv->lv_item->li_cil));
+ ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+ *len += lv->lv_buf_len - old->lv_buf_len;
+ *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
+ kmem_free(old->lv_buf);
+ kmem_free(old);
+ } else {
+ /* new lv, must pin the log item */
+ ASSERT(!lv->lv_item->li_lv);
+ ASSERT(list_empty(&lv->lv_item->li_cil));
+
+ *len += lv->lv_buf_len;
+ *diff_iovecs += lv->lv_niovecs;
+ IOP_PIN(lv->lv_item);
+
+ }
+
+ /* attach new log vector to log item */
+ lv->lv_item->li_lv = lv;
+
+ /*
+ * If this is the first time the item is being committed to the
+ * CIL, store the sequence number on the log item so we can
+ * tell in future commits whether this is the first checkpoint
+ * the item is being committed into.
+ */
+ if (!lv->lv_item->li_seq)
+ lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
+/*
+ * Insert the log items into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we addded to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ */
+static void
+xlog_cil_insert_items(
+ struct log *log,
+ struct xfs_log_vec *log_vector,
+ struct xlog_ticket *ticket)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil_ctx *ctx = cil->xc_ctx;
+ struct xfs_log_vec *lv;
+ int len = 0;
+ int diff_iovecs = 0;
+ int iclog_space;
+
+ ASSERT(log_vector);
+
+ /*
+ * Do all the accounting aggregation and switching of log vectors
+ * around in a separate loop to the insertion of items into the CIL.
+ * Then we can do a separate loop to update the CIL within a single
+ * lock/unlock pair. This reduces the number of round trips on the CIL
+ * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
+ * hold time for the transaction commit.
+ *
+ * If this is the first time the item is being placed into the CIL in
+ * this context, pin it so it can't be written to disk until the CIL is
+ * flushed to the iclog and the iclog written to disk.
+ *
+ * We can do this safely because the context can't checkpoint until we
+ * are done so it doesn't matter exactly how we update the CIL.
+ */
+ for (lv = log_vector; lv; lv = lv->lv_next)
+ xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+
+ /* account for space used by new iovec headers */
+ len += diff_iovecs * sizeof(xlog_op_header_t);
+
+ spin_lock(&cil->xc_cil_lock);
+
+ /* move the items to the tail of the CIL */
+ for (lv = log_vector; lv; lv = lv->lv_next)
+ list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+
+ ctx->nvecs += diff_iovecs;
+
+ /*
+ * Now transfer enough transaction reservation to the context ticket
+ * for the checkpoint. The context ticket is special - the unit
+ * reservation has to grow as well as the current reservation as we
+ * steal from tickets so we can correctly determine the space used
+ * during the transaction commit.
+ */
+ if (ctx->ticket->t_curr_res == 0) {
+ /* first commit in checkpoint, steal the header reservation */
+ ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+ ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+ ticket->t_curr_res -= ctx->ticket->t_unit_res;
+ }
+
+ /* do we need space for more log record headers? */
+ iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+ if (len > 0 && (ctx->space_used / iclog_space !=
+ (ctx->space_used + len) / iclog_space)) {
+ int hdrs;
+
+ hdrs = (len + iclog_space - 1) / iclog_space;
+ /* need to take into account split region headers, too */
+ hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+ ctx->ticket->t_unit_res += hdrs;
+ ctx->ticket->t_curr_res += hdrs;
+ ticket->t_curr_res -= hdrs;
+ ASSERT(ticket->t_curr_res >= len);
+ }
+ ticket->t_curr_res -= len;
+ ctx->space_used += len;
+
+ spin_unlock(&cil->xc_cil_lock);
+}
+
+static void
+xlog_cil_free_logvec(
+ struct xfs_log_vec *log_vector)
+{
+ struct xfs_log_vec *lv;
+
+ for (lv = log_vector; lv; ) {
+ struct xfs_log_vec *next = lv->lv_next;
+ kmem_free(lv->lv_buf);
+ kmem_free(lv);
+ lv = next;
+ }
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+ void *args,
+ int abort)
+{
+ struct xfs_cil_ctx *ctx = args;
+ struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+
+ xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
+ ctx->start_lsn, abort);
+
+ xfs_alloc_busy_sort(&ctx->busy_extents);
+ xfs_alloc_busy_clear(mp, &ctx->busy_extents,
+ (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
+
+ spin_lock(&ctx->cil->xc_cil_lock);
+ list_del(&ctx->committing);
+ spin_unlock(&ctx->cil->xc_cil_lock);
+
+ xlog_cil_free_logvec(ctx->lv_chain);
+
+ if (!list_empty(&ctx->busy_extents)) {
+ ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+
+ xfs_discard_extents(mp, &ctx->busy_extents);
+ xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
+ }
+
+ kmem_free(ctx);
+}
+
+/*
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
+ * is a background flush and so we can chose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allows log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
+ */
+STATIC int
+xlog_cil_push(
+ struct log *log,
+ xfs_lsn_t push_seq)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_log_vec *lv;
+ struct xfs_cil_ctx *ctx;
+ struct xfs_cil_ctx *new_ctx;
+ struct xlog_in_core *commit_iclog;
+ struct xlog_ticket *tic;
+ int num_lv;
+ int num_iovecs;
+ int len;
+ int error = 0;
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr;
+ struct xfs_log_vec lvhdr = { NULL };
+ xfs_lsn_t commit_lsn;
+
+ if (!cil)
+ return 0;
+
+ ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
+
+ new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+ new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+ /*
+ * Lock out transaction commit, but don't block for background pushes
+ * unless we are well over the CIL space limit. See the definition of
+ * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
+ * used here.
+ */
+ if (!down_write_trylock(&cil->xc_ctx_lock)) {
+ if (!push_seq &&
+ cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
+ goto out_free_ticket;
+ down_write(&cil->xc_ctx_lock);
+ }
+ ctx = cil->xc_ctx;
+
+ /* check if we've anything to push */
+ if (list_empty(&cil->xc_cil))
+ goto out_skip;
+
+ /* check for spurious background flush */
+ if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ goto out_skip;
+
+ /* check for a previously pushed seqeunce */
+ if (push_seq && push_seq < cil->xc_ctx->sequence)
+ goto out_skip;
+
+ /*
+ * pull all the log vectors off the items in the CIL, and
+ * remove the items from the CIL. We don't need the CIL lock
+ * here because it's only needed on the transaction commit
+ * side which is currently locked out by the flush lock.
+ */
+ lv = NULL;
+ num_lv = 0;
+ num_iovecs = 0;
+ len = 0;
+ while (!list_empty(&cil->xc_cil)) {
+ struct xfs_log_item *item;
+ int i;
+
+ item = list_first_entry(&cil->xc_cil,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ if (!ctx->lv_chain)
+ ctx->lv_chain = item->li_lv;
+ else
+ lv->lv_next = item->li_lv;
+ lv = item->li_lv;
+ item->li_lv = NULL;
+
+ num_lv++;
+ num_iovecs += lv->lv_niovecs;
+ for (i = 0; i < lv->lv_niovecs; i++)
+ len += lv->lv_iovecp[i].i_len;
+ }
+
+ /*
+ * initialise the new context and attach it to the CIL. Then attach
+ * the current context to the CIL committing lsit so it can be found
+ * during log forces to extract the commit lsn of the sequence that
+ * needs to be forced.
+ */
+ INIT_LIST_HEAD(&new_ctx->committing);
+ INIT_LIST_HEAD(&new_ctx->busy_extents);
+ new_ctx->sequence = ctx->sequence + 1;
+ new_ctx->cil = cil;
+ cil->xc_ctx = new_ctx;
+
+ /*
+ * mirror the new sequence into the cil structure so that we can do
+ * unlocked checks against the current sequence in log forces without
+ * risking deferencing a freed context pointer.
+ */
+ cil->xc_current_sequence = new_ctx->sequence;
+
+ /*
+ * The switch is now done, so we can drop the context lock and move out
+ * of a shared context. We can't just go straight to the commit record,
+ * though - we need to synchronise with previous and future commits so
+ * that the commit records are correctly ordered in the log to ensure
+ * that we process items during log IO completion in the correct order.
+ *
+ * For example, if we get an EFI in one checkpoint and the EFD in the
+ * next (e.g. due to log forces), we do not want the checkpoint with
+ * the EFD to be committed before the checkpoint with the EFI. Hence
+ * we must strictly order the commit records of the checkpoints so
+ * that: a) the checkpoint callbacks are attached to the iclogs in the
+ * correct order; and b) the checkpoints are replayed in correct order
+ * in log recovery.
+ *
+ * Hence we need to add this context to the committing context list so
+ * that higher sequences will wait for us to write out a commit record
+ * before they do.
+ */
+ spin_lock(&cil->xc_cil_lock);
+ list_add(&ctx->committing, &cil->xc_committing);
+ spin_unlock(&cil->xc_cil_lock);
+ up_write(&cil->xc_ctx_lock);
+
+ /*
+ * Build a checkpoint transaction header and write it to the log to
+ * begin the transaction. We need to account for the space used by the
+ * transaction header here as it is not accounted for in xlog_write().
+ *
+ * The LSN we need to pass to the log items on transaction commit is
+ * the LSN reported by the first log vector write. If we use the commit
+ * record lsn then we can move the tail beyond the grant write head.
+ */
+ tic = ctx->ticket;
+ thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ thdr.th_type = XFS_TRANS_CHECKPOINT;
+ thdr.th_tid = tic->t_tid;
+ thdr.th_num_items = num_iovecs;
+ lhdr.i_addr = &thdr;
+ lhdr.i_len = sizeof(xfs_trans_header_t);
+ lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+ tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+
+ lvhdr.lv_niovecs = 1;
+ lvhdr.lv_iovecp = &lhdr;
+ lvhdr.lv_next = ctx->lv_chain;
+
+ error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+ if (error)
+ goto out_abort_free_ticket;
+
+ /*
+ * now that we've written the checkpoint into the log, strictly
+ * order the commit records so replay will get them in the right order.
+ */
+restart:
+ spin_lock(&cil->xc_cil_lock);
+ list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+ /*
+ * Higher sequences will wait for this one so skip them.
+ * Don't wait for own own sequence, either.
+ */
+ if (new_ctx->sequence >= ctx->sequence)
+ continue;
+ if (!new_ctx->commit_lsn) {
+ /*
+ * It is still being pushed! Wait for the push to
+ * complete, then start again from the beginning.
+ */
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+ goto restart;
+ }
+ }
+ spin_unlock(&cil->xc_cil_lock);
+
+ /* xfs_log_done always frees the ticket on error. */
+ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+ if (commit_lsn == -1)
+ goto out_abort;
+
+ /* attach all the transactions w/ busy extents to iclog */
+ ctx->log_cb.cb_func = xlog_cil_committed;
+ ctx->log_cb.cb_arg = ctx;
+ error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+ if (error)
+ goto out_abort;
+
+ /*
+ * now the checkpoint commit is complete and we've attached the
+ * callbacks to the iclog we can assign the commit LSN to the context
+ * and wake up anyone who is waiting for the commit to complete.
+ */
+ spin_lock(&cil->xc_cil_lock);
+ ctx->commit_lsn = commit_lsn;
+ wake_up_all(&cil->xc_commit_wait);
+ spin_unlock(&cil->xc_cil_lock);
+
+ /* release the hounds! */
+ return xfs_log_release_iclog(log->l_mp, commit_iclog);
+
+out_skip:
+ up_write(&cil->xc_ctx_lock);
+out_free_ticket:
+ xfs_log_ticket_put(new_ctx->ticket);
+ kmem_free(new_ctx);
+ return 0;
+
+out_abort_free_ticket:
+ xfs_log_ticket_put(tic);
+out_abort:
+ xlog_cil_committed(ctx, XFS_LI_ABORTED);
+ return XFS_ERROR(EIO);
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+void
+xfs_log_commit_cil(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_log_vec *log_vector,
+ xfs_lsn_t *commit_lsn,
+ int flags)
+{
+ struct log *log = mp->m_log;
+ int log_flags = 0;
+ int push = 0;
+
+ if (flags & XFS_TRANS_RELEASE_LOG_RES)
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+
+ /*
+ * do all the hard work of formatting items (including memory
+ * allocation) outside the CIL context lock. This prevents stalling CIL
+ * pushes when we are low on memory and a transaction commit spends a
+ * lot of time in memory reclaim.
+ */
+ xlog_cil_format_items(log, log_vector);
+
+ /* lock out background commit */
+ down_read(&log->l_cilp->xc_ctx_lock);
+ if (commit_lsn)
+ *commit_lsn = log->l_cilp->xc_ctx->sequence;
+
+ xlog_cil_insert_items(log, log_vector, tp->t_ticket);
+
+ /* check we didn't blow the reservation */
+ if (tp->t_ticket->t_curr_res < 0)
+ xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+ /* attach the transaction to the CIL if it has any busy extents */
+ if (!list_empty(&tp->t_busy)) {
+ spin_lock(&log->l_cilp->xc_cil_lock);
+ list_splice_init(&tp->t_busy,
+ &log->l_cilp->xc_ctx->busy_extents);
+ spin_unlock(&log->l_cilp->xc_cil_lock);
+ }
+
+ tp->t_commit_lsn = *commit_lsn;
+ xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ xfs_trans_unreserve_and_mod_sb(tp);
+
+ /*
+ * Once all the items of the transaction have been copied to the CIL,
+ * the items can be unlocked and freed.
+ *
+ * This needs to be done before we drop the CIL context lock because we
+ * have to update state in the log items and unlock them before they go
+ * to disk. If we don't, then the CIL checkpoint can race with us and
+ * we can run checkpoint completion before we've updated and unlocked
+ * the log items. This affects (at least) processing of stale buffers,
+ * inodes and EFIs.
+ */
+ xfs_trans_free_items(tp, *commit_lsn, 0);
+
+ /* check for background commit before unlock */
+ if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+ push = 1;
+
+ up_read(&log->l_cilp->xc_ctx_lock);
+
+ /*
+ * We need to push CIL every so often so we don't cache more than we
+ * can fit in the log. The limit really is that a checkpoint can't be
+ * more than half the log (the current checkpoint is not allowed to
+ * overwrite the previous checkpoint), but commit latency and memory
+ * usage limit this to a smaller size in most cases.
+ */
+ if (push)
+ xlog_cil_push(log, 0);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if a
+ * iclog flush is necessary following this call.
+ *
+ * XXX: Initially, just push the CIL unconditionally and return whatever
+ * commit lsn is there. It'll be empty, so this is broken for now.
+ */
+xfs_lsn_t
+xlog_cil_force_lsn(
+ struct log *log,
+ xfs_lsn_t sequence)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil_ctx *ctx;
+ xfs_lsn_t commit_lsn = NULLCOMMITLSN;
+
+ ASSERT(sequence <= cil->xc_current_sequence);
+
+ /*
+ * check to see if we need to force out the current context.
+ * xlog_cil_push() handles racing pushes for the same sequence,
+ * so no need to deal with it here.
+ */
+ if (sequence == cil->xc_current_sequence)
+ xlog_cil_push(log, sequence);
+
+ /*
+ * See if we can find a previous sequence still committing.
+ * We need to wait for all previous sequence commits to complete
+ * before allowing the force of push_seq to go ahead. Hence block
+ * on commits for those as well.
+ */
+restart:
+ spin_lock(&cil->xc_cil_lock);
+ list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ if (ctx->sequence > sequence)
+ continue;
+ if (!ctx->commit_lsn) {
+ /*
+ * It is still being pushed! Wait for the push to
+ * complete, then start again from the beginning.
+ */
+ xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+ goto restart;
+ }
+ if (ctx->sequence != sequence)
+ continue;
+ /* found it! */
+ commit_lsn = ctx->commit_lsn;
+ }
+ spin_unlock(&cil->xc_cil_lock);
+ return commit_lsn;
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ struct xfs_cil_ctx *ctx;
+
+ if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
+ return false;
+ if (list_empty(&lip->li_cil))
+ return false;
+
+ ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+ return false;
+ return true;
+}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
new file mode 100644
index 00000000..2d3b6a49
--- /dev/null
+++ b/fs/xfs/xfs_log_priv.h
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_LOG_PRIV_H__
+#define __XFS_LOG_PRIV_H__
+
+struct xfs_buf;
+struct log;
+struct xlog_ticket;
+struct xfs_mount;
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_MIN_ICLOGS 2
+#define XLOG_MAX_ICLOGS 8
+#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
+#define XLOG_VERSION_1 1
+#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
+#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+ (log)->l_mp->m_sb.sb_logsunit)
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
+
+#define XLOG_HEADER_SIZE 512
+
+#define XLOG_REC_SHIFT(log) \
+ BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+ BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+ return ((xfs_lsn_t)cycle << 32) | block;
+}
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+ if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+ return be32_to_cpu(*((__be32 *)ptr + 1));
+ else
+ return be32_to_cpu(*(__be32 *)ptr);
+}
+
+#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
+
+#ifdef __KERNEL__
+
+/*
+ * get client id from packed copy.
+ *
+ * this hack is here because the xlog_pack code copies four bytes
+ * of xlog_op_header containing the fields oh_clientid, oh_flags
+ * and oh_res2 into the packed copy.
+ *
+ * later on this four byte chunk is treated as an int and the
+ * client id is pulled out.
+ *
+ * this has endian issues, of course.
+ */
+static inline uint xlog_get_client_id(__be32 i)
+{
+ return be32_to_cpu(i) >> 24;
+}
+
+/*
+ * In core log state
+ */
+#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */
+#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
+#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */
+#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
+#define XLOG_STATE_DO_CALLBACK \
+ 0x0010 /* Process callback functions */
+#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
+#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
+#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
+#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
+#endif /* __KERNEL__ */
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS. Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions. Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS 0x01 /* Start a new transaction */
+#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
+#define XLOG_END_TRANS 0x10 /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
+
+#ifdef __KERNEL__
+/*
+ * Flags to log ticket
+ */
+#define XLOG_TIC_INITED 0x1 /* has been initialized */
+#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
+
+#define XLOG_TIC_FLAGS \
+ { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
+ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
+
+#endif /* __KERNEL__ */
+
+#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
+
+/*
+ * Flags for log structure
+ */
+#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
+#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
+#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
+#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
+ shutdown */
+#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
+
+typedef __uint32_t xlog_tid_t;
+
+#ifdef __KERNEL__
+/*
+ * Below are states for covering allocation transactions.
+ * By covering, we mean changing the h_tail_lsn in the last on-disk
+ * log write such that no allocation transactions will be re-done during
+ * recovery after a system crash. Recovery starts at the last on-disk
+ * log write.
+ *
+ * These states are used to insert dummy log entries to cover
+ * space allocation transactions which can undo non-transactional changes
+ * after a crash. Writes to a file with space
+ * already allocated do not result in any transactions. Allocations
+ * might include space beyond the EOF. So if we just push the EOF a
+ * little, the last transaction for the file could contain the wrong
+ * size. If there is no file system activity, after an allocation
+ * transaction, and the system crashes, the allocation transaction
+ * will get replayed and the file will be truncated. This could
+ * be hours/days/... after the allocation occurred.
+ *
+ * The fix for this is to do two dummy transactions when the
+ * system is idle. We need two dummy transaction because the h_tail_lsn
+ * in the log record header needs to point beyond the last possible
+ * non-dummy transaction. The first dummy changes the h_tail_lsn to
+ * the first transaction before the dummy. The second dummy causes
+ * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
+ *
+ * These dummy transactions get committed when everything
+ * is idle (after there has been some activity).
+ *
+ * There are 5 states used to control this.
+ *
+ * IDLE -- no logging has been done on the file system or
+ * we are done covering previous transactions.
+ * NEED -- logging has occurred and we need a dummy transaction
+ * when the log becomes idle.
+ * DONE -- we were in the NEED state and have committed a dummy
+ * transaction.
+ * NEED2 -- we detected that a dummy transaction has gone to the
+ * on disk log with no other transactions.
+ * DONE2 -- we committed a dummy transaction when in the NEED2 state.
+ *
+ * There are two places where we switch states:
+ *
+ * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
+ * We commit the dummy transaction and switch to DONE or DONE2,
+ * respectively. In all other states, we don't do anything.
+ *
+ * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
+ *
+ * No matter what state we are in, if this isn't the dummy
+ * transaction going out, the next state is NEED.
+ * So, if we aren't in the DONE or DONE2 states, the next state
+ * is NEED. We can't be finishing a write of the dummy record
+ * unless it was committed and the state switched to DONE or DONE2.
+ *
+ * If we are in the DONE state and this was a write of the
+ * dummy transaction, we move to NEED2.
+ *
+ * If we are in the DONE2 state and this was a write of the
+ * dummy transaction, we move to IDLE.
+ *
+ *
+ * Writing only one dummy transaction can get appended to
+ * one file space allocation. When this happens, the log recovery
+ * code replays the space allocation and a file could be truncated.
+ * This is why we have the NEED2 and DONE2 states before going idle.
+ */
+
+#define XLOG_STATE_COVER_IDLE 0
+#define XLOG_STATE_COVER_NEED 1
+#define XLOG_STATE_COVER_DONE 2
+#define XLOG_STATE_COVER_NEED2 3
+#define XLOG_STATE_COVER_DONE2 4
+
+#define XLOG_COVER_OPS 5
+
+
+/* Ticket reservation region accounting */
+#define XLOG_TIC_LEN_MAX 15
+
+/*
+ * Reservation region
+ * As would be stored in xfs_log_iovec but without the i_addr which
+ * we don't care about.
+ */
+typedef struct xlog_res {
+ uint r_len; /* region length :4 */
+ uint r_type; /* region's transaction type :4 */
+} xlog_res_t;
+
+typedef struct xlog_ticket {
+ wait_queue_head_t t_wait; /* ticket wait queue */
+ struct list_head t_queue; /* reserve/write queue */
+ xlog_tid_t t_tid; /* transaction identifier : 4 */
+ atomic_t t_ref; /* ticket reference count : 4 */
+ int t_curr_res; /* current reservation in bytes : 4 */
+ int t_unit_res; /* unit reservation in bytes : 4 */
+ char t_ocnt; /* original count : 1 */
+ char t_cnt; /* current count : 1 */
+ char t_clientid; /* who does this belong to; : 1 */
+ char t_flags; /* properties of reservation : 1 */
+ uint t_trans_type; /* transaction type : 4 */
+
+ /* reservation array fields */
+ uint t_res_num; /* num in array : 4 */
+ uint t_res_num_ophdrs; /* num op hdrs : 4 */
+ uint t_res_arr_sum; /* array sum : 4 */
+ uint t_res_o_flow; /* sum overflow : 4 */
+ xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
+} xlog_ticket_t;
+
+#endif
+
+
+typedef struct xlog_op_header {
+ __be32 oh_tid; /* transaction id of operation : 4 b */
+ __be32 oh_len; /* bytes in data region : 4 b */
+ __u8 oh_clientid; /* who sent me this : 1 b */
+ __u8 oh_flags; /* : 1 b */
+ __u16 oh_res2; /* 32 bit align : 2 b */
+} xlog_op_header_t;
+
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN 0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE 3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+ __be32 h_magicno; /* log record (LR) identifier : 4 */
+ __be32 h_cycle; /* write cycle of log : 4 */
+ __be32 h_version; /* LR version : 4 */
+ __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
+ __be64 h_lsn; /* lsn of this LR : 8 */
+ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
+ __be32 h_chksum; /* may not be used; non-zero if used : 4 */
+ __be32 h_prev_block; /* block number to previous LR : 4 */
+ __be32 h_num_logops; /* number of log operations in this LR : 4 */
+ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+ /* new fields */
+ __be32 h_fmt; /* format of log record : 4 */
+ uuid_t h_fs_uuid; /* uuid of FS : 16 */
+ __be32 h_size; /* iclog size : 4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+ __be32 xh_cycle; /* write cycle of log : 4 */
+ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
+} xlog_rec_ext_header_t;
+
+#ifdef __KERNEL__
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+ xlog_rec_header_t hic_header;
+ xlog_rec_ext_header_t hic_xheader;
+ char hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/*
+ * - A log record header is 512 bytes. There is plenty of room to grow the
+ * xlog_rec_header_t into the reserved space.
+ * - ic_data follows, so a write to disk can start at the beginning of
+ * the iclog.
+ * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
+ * - ic_next is the pointer to the next iclog in the ring.
+ * - ic_bp is a pointer to the buffer used to write this incore log to disk.
+ * - ic_log is a pointer back to the global log structure.
+ * - ic_callback is a linked list of callback function/argument pairs to be
+ * called after an iclog finishes writing.
+ * - ic_size is the full size of the header plus data.
+ * - ic_offset is the current number of bytes written to in this iclog.
+ * - ic_refcnt is bumped when someone is writing to the log.
+ * - ic_state is the state of the iclog.
+ *
+ * Because of cacheline contention on large machines, we need to separate
+ * various resources onto different cachelines. To start with, make the
+ * structure cacheline aligned. The following fields can be contended on
+ * by independent processes:
+ *
+ * - ic_callback_*
+ * - ic_refcnt
+ * - fields protected by the global l_icloglock
+ *
+ * so we need to ensure that these fields are located in separate cachelines.
+ * We'll put all the read-only and l_icloglock fields in the first cacheline,
+ * and move everything else out to subsequent cachelines.
+ */
+typedef struct xlog_in_core {
+ wait_queue_head_t ic_force_wait;
+ wait_queue_head_t ic_write_wait;
+ struct xlog_in_core *ic_next;
+ struct xlog_in_core *ic_prev;
+ struct xfs_buf *ic_bp;
+ struct log *ic_log;
+ int ic_size;
+ int ic_offset;
+ int ic_bwritecnt;
+ unsigned short ic_state;
+ char *ic_datap; /* pointer to iclog data */
+
+ /* Callback structures need their own cacheline */
+ spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
+ xfs_log_callback_t *ic_callback;
+ xfs_log_callback_t **ic_callback_tail;
+
+ /* reference counts need their own cacheline */
+ atomic_t ic_refcnt ____cacheline_aligned_in_smp;
+ xlog_in_core_2_t *ic_data;
+#define ic_header ic_data->hic_header
+} xlog_in_core_t;
+
+/*
+ * The CIL context is used to aggregate per-transaction details as well be
+ * passed to the iclog for checkpoint post-commit processing. After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+ struct xfs_cil *cil;
+ xfs_lsn_t sequence; /* chkpt sequence # */
+ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
+ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
+ struct xlog_ticket *ticket; /* chkpt ticket */
+ int nvecs; /* number of regions */
+ int space_used; /* aggregate size of regions */
+ struct list_head busy_extents; /* busy extents in chkpt */
+ struct xfs_log_vec *lv_chain; /* logvecs being pushed */
+ xfs_log_callback_t log_cb; /* completion callback hook. */
+ struct list_head committing; /* ctx committing list */
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have a the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+ struct log *xc_log;
+ struct list_head xc_cil;
+ spinlock_t xc_cil_lock;
+ struct xfs_cil_ctx *xc_ctx;
+ struct rw_semaphore xc_ctx_lock;
+ struct list_head xc_committing;
+ wait_queue_head_t xc_commit_wait;
+ xfs_lsn_t xc_current_sequence;
+};
+
+/*
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
+ * Whatever we choose, we have to make sure we can get a reservation for the
+ * log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, but it is not too large for
+ * the log or induces too much latency when writing out through the iclogs. We
+ * track both space consumed and the number of vectors in the checkpoint
+ * context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers. The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can effectively make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
+ * limited by that. Furthermore, the log transaction reservation subsystem
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we will set a lower
+ * threshold at which background pushing is attempted without blocking current
+ * transaction commits. A separate, higher bound defines when CIL pushes are
+ * enforced to ensure we stay within our maximum checkpoint size bounds.
+ * threshold, yet give us plenty of space for aggregation on large logs.
+ */
+#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
+#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
+
+/*
+ * The reservation head lsn is not made up of a cycle number and block number.
+ * Instead, it uses a cycle number and byte number. Logs don't expect to
+ * overflow 31 bits worth of byte offset, so using a byte number will mean
+ * that round off problems won't occur when releasing partial reservations.
+ */
+typedef struct log {
+ /* The following fields don't need locking */
+ struct xfs_mount *l_mp; /* mount point */
+ struct xfs_ail *l_ailp; /* AIL log is working with */
+ struct xfs_cil *l_cilp; /* CIL log is working with */
+ struct xfs_buf *l_xbuf; /* extra buffer for log
+ * wrapping */
+ struct xfs_buftarg *l_targ; /* buftarg of log */
+ uint l_flags;
+ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
+ struct list_head *l_buf_cancel_table;
+ int l_iclog_hsize; /* size of iclog header */
+ int l_iclog_heads; /* # of iclog header sectors */
+ uint l_sectBBsize; /* sector size in BBs (2^n) */
+ int l_iclog_size; /* size of log in bytes */
+ int l_iclog_size_log; /* log power size of log */
+ int l_iclog_bufs; /* number of iclog buffers */
+ xfs_daddr_t l_logBBstart; /* start block of log */
+ int l_logsize; /* size of log in bytes */
+ int l_logBBsize; /* size of log in BB chunks */
+
+ /* The following block of fields are changed while holding icloglock */
+ wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
+ /* waiting for iclog flush */
+ int l_covered_state;/* state of "covering disk
+ * log entries" */
+ xlog_in_core_t *l_iclog; /* head log queue */
+ spinlock_t l_icloglock; /* grab to change iclog state */
+ int l_curr_cycle; /* Cycle number of log writes */
+ int l_prev_cycle; /* Cycle number before last
+ * block increment */
+ int l_curr_block; /* current logical log block */
+ int l_prev_block; /* previous logical log block */
+
+ /*
+ * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
+ * read without needing to hold specific locks. To avoid operations
+ * contending with other hot objects, place each of them on a separate
+ * cacheline.
+ */
+ /* lsn of last LR on disk */
+ atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
+ /* lsn of 1st LR with unflushed * buffers */
+ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
+
+ /*
+ * ticket grant locks, queues and accounting have their own cachlines
+ * as these are quite hot and can be operated on concurrently.
+ */
+ spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
+ struct list_head l_reserveq;
+ atomic64_t l_grant_reserve_head;
+
+ spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
+ struct list_head l_writeq;
+ atomic64_t l_grant_write_head;
+
+ /* The following field are used for debugging; need to hold icloglock */
+#ifdef DEBUG
+ char *l_iclog_bak[XLOG_MAX_ICLOGS];
+#endif
+
+} xlog_t;
+
+#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
+ ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
+
+#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
+
+/* common routines */
+extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
+extern int xlog_recover(xlog_t *log);
+extern int xlog_recover_finish(xlog_t *log);
+extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
+
+extern kmem_zone_t *xfs_log_ticket_zone;
+struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
+ int count, char client, uint xflags,
+ int alloc_flags);
+
+
+static inline void
+xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
+{
+ *ptr += bytes;
+ *len -= bytes;
+ *off += bytes;
+}
+
+void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+ xlog_in_core_t **commit_iclog, uint flags);
+
+/*
+ * When we crack an atomic LSN, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from. This should always
+ * be used to sample and crack LSNs that are stored and updated in atomic
+ * variables.
+ */
+static inline void
+xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
+{
+ xfs_lsn_t val = atomic64_read(lsn);
+
+ *cycle = CYCLE_LSN(val);
+ *block = BLOCK_LSN(val);
+}
+
+/*
+ * Calculate and assign a value to an atomic LSN variable from component pieces.
+ */
+static inline void
+xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
+{
+ atomic64_set(lsn, xlog_assign_lsn(cycle, block));
+}
+
+/*
+ * When we crack the grant head, we sample it first so that the value will not
+ * change while we are cracking it into the component values. This means we
+ * will always get consistent component values to work from.
+ */
+static inline void
+xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
+{
+ *cycle = val >> 32;
+ *space = val & 0xffffffff;
+}
+
+static inline void
+xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
+{
+ xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
+}
+
+static inline int64_t
+xlog_assign_grant_head_val(int cycle, int space)
+{
+ return ((int64_t)cycle << 32) | space;
+}
+
+static inline void
+xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
+{
+ atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
+}
+
+/*
+ * Committed Item List interfaces
+ */
+int xlog_cil_init(struct log *log);
+void xlog_cil_init_post_recovery(struct log *log);
+void xlog_cil_destroy(struct log *log);
+
+/*
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+
+static inline void
+xlog_cil_force(struct log *log)
+{
+ xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
+
+/*
+ * Unmount record type is used as a pseudo transaction type for the ticket.
+ * It's value must be outside the range of XFS_TRANS_* values.
+ */
+#define XLOG_UNMOUNT_REC_TYPE (-1U)
+
+/*
+ * Wrapper function for waiting on a wait queue serialised against wakeups
+ * by a spinlock. This matches the semantics of all the wait queues used in the
+ * log code.
+ */
+static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(wq, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+}
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
new file mode 100644
index 00000000..b75fd67c
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.c
@@ -0,0 +1,3843 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_log_recover.h"
+#include "xfs_extfree_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_quota.h"
+#include "xfs_rw.h"
+#include "xfs_utils.h"
+#include "xfs_trace.h"
+
+STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
+STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
+#if defined(DEBUG)
+STATIC void xlog_recover_check_summary(xlog_t *);
+#else
+#define xlog_recover_check_summary(log)
+#endif
+
+/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+ xfs_daddr_t bc_blkno;
+ uint bc_len;
+ int bc_refcount;
+ struct list_head bc_list;
+};
+
+/*
+ * Sector aligned buffer routines for buffer create/read/write/access
+ */
+
+/*
+ * Verify the given count of basic blocks is valid number of blocks
+ * to specify for an operation involving the given XFS log buffer.
+ * Returns nonzero if the count is valid, 0 otherwise.
+ */
+
+static inline int
+xlog_buf_bbcount_valid(
+ xlog_t *log,
+ int bbcount)
+{
+ return bbcount > 0 && bbcount <= log->l_logBBsize;
+}
+
+/*
+ * Allocate a buffer to hold log data. The buffer needs to be able
+ * to map to a range of nbblks basic blocks at any valid (basic
+ * block) offset within the log.
+ */
+STATIC xfs_buf_t *
+xlog_get_bp(
+ xlog_t *log,
+ int nbblks)
+{
+ if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
+ nbblks);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
+ return NULL;
+ }
+
+ /*
+ * We do log I/O in units of log sectors (a power-of-2
+ * multiple of the basic block size), so we round up the
+ * requested size to accommodate the basic blocks required
+ * for complete log sectors.
+ *
+ * In addition, the buffer may be used for a non-sector-
+ * aligned block offset, in which case an I/O of the
+ * requested size could extend beyond the end of the
+ * buffer. If the requested size is only 1 basic block it
+ * will never straddle a sector boundary, so this won't be
+ * an issue. Nor will this be a problem if the log I/O is
+ * done in basic blocks (sector size 1). But otherwise we
+ * extend the buffer by one extra log sector to ensure
+ * there's space to accommodate this possibility.
+ */
+ if (nbblks > 1 && log->l_sectBBsize > 1)
+ nbblks += log->l_sectBBsize;
+ nbblks = round_up(nbblks, log->l_sectBBsize);
+
+ return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
+ BBTOB(nbblks), 0);
+}
+
+STATIC void
+xlog_put_bp(
+ xfs_buf_t *bp)
+{
+ xfs_buf_free(bp);
+}
+
+/*
+ * Return the address of the start of the given block number's data
+ * in a log buffer. The buffer covers a log sector-aligned region.
+ */
+STATIC xfs_caddr_t
+xlog_align(
+ xlog_t *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ xfs_buf_t *bp)
+{
+ xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
+
+ ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
+ return XFS_BUF_PTR(bp) + BBTOB(offset);
+}
+
+
+/*
+ * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
+ */
+STATIC int
+xlog_bread_noalign(
+ xlog_t *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ xfs_buf_t *bp)
+{
+ int error;
+
+ if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
+ nbblks);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
+ return EFSCORRUPTED;
+ }
+
+ blk_no = round_down(blk_no, log->l_sectBBsize);
+ nbblks = round_up(nbblks, log->l_sectBBsize);
+
+ ASSERT(nbblks > 0);
+ ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+
+ XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+ XFS_BUF_READ(bp);
+ XFS_BUF_BUSY(bp);
+ XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
+
+ xfsbdstrat(log->l_mp, bp);
+ error = xfs_buf_iowait(bp);
+ if (error)
+ xfs_ioerror_alert("xlog_bread", log->l_mp,
+ bp, XFS_BUF_ADDR(bp));
+ return error;
+}
+
+STATIC int
+xlog_bread(
+ xlog_t *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ xfs_buf_t *bp,
+ xfs_caddr_t *offset)
+{
+ int error;
+
+ error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+ if (error)
+ return error;
+
+ *offset = xlog_align(log, blk_no, nbblks, bp);
+ return 0;
+}
+
+/*
+ * Read at an offset into the buffer. Returns with the buffer in it's original
+ * state regardless of the result of the read.
+ */
+STATIC int
+xlog_bread_offset(
+ xlog_t *log,
+ xfs_daddr_t blk_no, /* block to read from */
+ int nbblks, /* blocks to read */
+ xfs_buf_t *bp,
+ xfs_caddr_t offset)
+{
+ xfs_caddr_t orig_offset = XFS_BUF_PTR(bp);
+ int orig_len = bp->b_buffer_length;
+ int error, error2;
+
+ error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
+ if (error)
+ return error;
+
+ error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+
+ /* must reset buffer pointer even on error */
+ error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
+ if (error)
+ return error;
+ return error2;
+}
+
+/*
+ * Write out the buffer at the given block for the given number of blocks.
+ * The buffer is kept locked across the write and is returned locked.
+ * This can only be used for synchronous log writes.
+ */
+STATIC int
+xlog_bwrite(
+ xlog_t *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ xfs_buf_t *bp)
+{
+ int error;
+
+ if (!xlog_buf_bbcount_valid(log, nbblks)) {
+ xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
+ nbblks);
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
+ return EFSCORRUPTED;
+ }
+
+ blk_no = round_down(blk_no, log->l_sectBBsize);
+ nbblks = round_up(nbblks, log->l_sectBBsize);
+
+ ASSERT(nbblks > 0);
+ ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+
+ XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+ XFS_BUF_ZEROFLAGS(bp);
+ XFS_BUF_BUSY(bp);
+ XFS_BUF_HOLD(bp);
+ XFS_BUF_PSEMA(bp, PRIBIO);
+ XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+ XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
+
+ if ((error = xfs_bwrite(log->l_mp, bp)))
+ xfs_ioerror_alert("xlog_bwrite", log->l_mp,
+ bp, XFS_BUF_ADDR(bp));
+ return error;
+}
+
+#ifdef DEBUG
+/*
+ * dump debug superblock and log record information
+ */
+STATIC void
+xlog_header_check_dump(
+ xfs_mount_t *mp,
+ xlog_rec_header_t *head)
+{
+ xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
+ __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
+ xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
+ &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
+}
+#else
+#define xlog_header_check_dump(mp, head)
+#endif
+
+/*
+ * check log record header for recovery
+ */
+STATIC int
+xlog_header_check_recover(
+ xfs_mount_t *mp,
+ xlog_rec_header_t *head)
+{
+ ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
+
+ /*
+ * IRIX doesn't write the h_fmt field and leaves it zeroed
+ * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
+ * a dirty log created in IRIX.
+ */
+ if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
+ xfs_warn(mp,
+ "dirty log written in incompatible format - can't recover");
+ xlog_header_check_dump(mp, head);
+ XFS_ERROR_REPORT("xlog_header_check_recover(1)",
+ XFS_ERRLEVEL_HIGH, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+ xfs_warn(mp,
+ "dirty log entry has mismatched uuid - can't recover");
+ xlog_header_check_dump(mp, head);
+ XFS_ERROR_REPORT("xlog_header_check_recover(2)",
+ XFS_ERRLEVEL_HIGH, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
+
+/*
+ * read the head block of the log and check the header
+ */
+STATIC int
+xlog_header_check_mount(
+ xfs_mount_t *mp,
+ xlog_rec_header_t *head)
+{
+ ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
+
+ if (uuid_is_nil(&head->h_fs_uuid)) {
+ /*
+ * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
+ * h_fs_uuid is nil, we assume this log was last mounted
+ * by IRIX and continue.
+ */
+ xfs_warn(mp, "nil uuid in log - IRIX style log");
+ } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+ xfs_warn(mp, "log has mismatched uuid - can't recover");
+ xlog_header_check_dump(mp, head);
+ XFS_ERROR_REPORT("xlog_header_check_mount",
+ XFS_ERRLEVEL_HIGH, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
+
+STATIC void
+xlog_recover_iodone(
+ struct xfs_buf *bp)
+{
+ if (XFS_BUF_GETERROR(bp)) {
+ /*
+ * We're not going to bother about retrying
+ * this during recovery. One strike!
+ */
+ xfs_ioerror_alert("xlog_recover_iodone",
+ bp->b_target->bt_mount, bp,
+ XFS_BUF_ADDR(bp));
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_META_IO_ERROR);
+ }
+ XFS_BUF_CLR_IODONE_FUNC(bp);
+ xfs_buf_ioend(bp, 0);
+}
+
+/*
+ * This routine finds (to an approximation) the first block in the physical
+ * log which contains the given cycle. It uses a binary search algorithm.
+ * Note that the algorithm can not be perfect because the disk will not
+ * necessarily be perfect.
+ */
+STATIC int
+xlog_find_cycle_start(
+ xlog_t *log,
+ xfs_buf_t *bp,
+ xfs_daddr_t first_blk,
+ xfs_daddr_t *last_blk,
+ uint cycle)
+{
+ xfs_caddr_t offset;
+ xfs_daddr_t mid_blk;
+ xfs_daddr_t end_blk;
+ uint mid_cycle;
+ int error;
+
+ end_blk = *last_blk;
+ mid_blk = BLK_AVG(first_blk, end_blk);
+ while (mid_blk != first_blk && mid_blk != end_blk) {
+ error = xlog_bread(log, mid_blk, 1, bp, &offset);
+ if (error)
+ return error;
+ mid_cycle = xlog_get_cycle(offset);
+ if (mid_cycle == cycle)
+ end_blk = mid_blk; /* last_half_cycle == mid_cycle */
+ else
+ first_blk = mid_blk; /* first_half_cycle == mid_cycle */
+ mid_blk = BLK_AVG(first_blk, end_blk);
+ }
+ ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
+ (mid_blk == end_blk && mid_blk-1 == first_blk));
+
+ *last_blk = end_blk;
+
+ return 0;
+}
+
+/*
+ * Check that a range of blocks does not contain stop_on_cycle_no.
+ * Fill in *new_blk with the block offset where such a block is
+ * found, or with -1 (an invalid block number) if there is no such
+ * block in the range. The scan needs to occur from front to back
+ * and the pointer into the region must be updated since a later
+ * routine will need to perform another test.
+ */
+STATIC int
+xlog_find_verify_cycle(
+ xlog_t *log,
+ xfs_daddr_t start_blk,
+ int nbblks,
+ uint stop_on_cycle_no,
+ xfs_daddr_t *new_blk)
+{
+ xfs_daddr_t i, j;
+ uint cycle;
+ xfs_buf_t *bp;
+ xfs_daddr_t bufblks;
+ xfs_caddr_t buf = NULL;
+ int error = 0;
+
+ /*
+ * Greedily allocate a buffer big enough to handle the full
+ * range of basic blocks we'll be examining. If that fails,
+ * try a smaller size. We need to be able to read at least
+ * a log sector, or we're out of luck.
+ */
+ bufblks = 1 << ffs(nbblks);
+ while (!(bp = xlog_get_bp(log, bufblks))) {
+ bufblks >>= 1;
+ if (bufblks < log->l_sectBBsize)
+ return ENOMEM;
+ }
+
+ for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
+ int bcount;
+
+ bcount = min(bufblks, (start_blk + nbblks - i));
+
+ error = xlog_bread(log, i, bcount, bp, &buf);
+ if (error)
+ goto out;
+
+ for (j = 0; j < bcount; j++) {
+ cycle = xlog_get_cycle(buf);
+ if (cycle == stop_on_cycle_no) {
+ *new_blk = i+j;
+ goto out;
+ }
+
+ buf += BBSIZE;
+ }
+ }
+
+ *new_blk = -1;
+
+out:
+ xlog_put_bp(bp);
+ return error;
+}
+
+/*
+ * Potentially backup over partial log record write.
+ *
+ * In the typical case, last_blk is the number of the block directly after
+ * a good log record. Therefore, we subtract one to get the block number
+ * of the last block in the given buffer. extra_bblks contains the number
+ * of blocks we would have read on a previous read. This happens when the
+ * last log record is split over the end of the physical log.
+ *
+ * extra_bblks is the number of blocks potentially verified on a previous
+ * call to this routine.
+ */
+STATIC int
+xlog_find_verify_log_record(
+ xlog_t *log,
+ xfs_daddr_t start_blk,
+ xfs_daddr_t *last_blk,
+ int extra_bblks)
+{
+ xfs_daddr_t i;
+ xfs_buf_t *bp;
+ xfs_caddr_t offset = NULL;
+ xlog_rec_header_t *head = NULL;
+ int error = 0;
+ int smallmem = 0;
+ int num_blks = *last_blk - start_blk;
+ int xhdrs;
+
+ ASSERT(start_blk != 0 || *last_blk != start_blk);
+
+ if (!(bp = xlog_get_bp(log, num_blks))) {
+ if (!(bp = xlog_get_bp(log, 1)))
+ return ENOMEM;
+ smallmem = 1;
+ } else {
+ error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+ if (error)
+ goto out;
+ offset += ((num_blks - 1) << BBSHIFT);
+ }
+
+ for (i = (*last_blk) - 1; i >= 0; i--) {
+ if (i < start_blk) {
+ /* valid log record not found */
+ xfs_warn(log->l_mp,
+ "Log inconsistent (didn't find previous header)");
+ ASSERT(0);
+ error = XFS_ERROR(EIO);
+ goto out;
+ }
+
+ if (smallmem) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out;
+ }
+
+ head = (xlog_rec_header_t *)offset;
+
+ if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
+ break;
+
+ if (!smallmem)
+ offset -= BBSIZE;
+ }
+
+ /*
+ * We hit the beginning of the physical log & still no header. Return
+ * to caller. If caller can handle a return of -1, then this routine
+ * will be called again for the end of the physical log.
+ */
+ if (i == -1) {
+ error = -1;
+ goto out;
+ }
+
+ /*
+ * We have the final block of the good log (the first block
+ * of the log record _before_ the head. So we check the uuid.
+ */
+ if ((error = xlog_header_check_mount(log->l_mp, head)))
+ goto out;
+
+ /*
+ * We may have found a log record header before we expected one.
+ * last_blk will be the 1st block # with a given cycle #. We may end
+ * up reading an entire log record. In this case, we don't want to
+ * reset last_blk. Only when last_blk points in the middle of a log
+ * record do we update last_blk.
+ */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ uint h_size = be32_to_cpu(head->h_size);
+
+ xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
+ if (h_size % XLOG_HEADER_CYCLE_SIZE)
+ xhdrs++;
+ } else {
+ xhdrs = 1;
+ }
+
+ if (*last_blk - i + extra_bblks !=
+ BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
+ *last_blk = i;
+
+out:
+ xlog_put_bp(bp);
+ return error;
+}
+
+/*
+ * Head is defined to be the point of the log where the next log write
+ * write could go. This means that incomplete LR writes at the end are
+ * eliminated when calculating the head. We aren't guaranteed that previous
+ * LR have complete transactions. We only know that a cycle number of
+ * current cycle number -1 won't be present in the log if we start writing
+ * from our current block number.
+ *
+ * last_blk contains the block number of the first block with a given
+ * cycle number.
+ *
+ * Return: zero if normal, non-zero if error.
+ */
+STATIC int
+xlog_find_head(
+ xlog_t *log,
+ xfs_daddr_t *return_head_blk)
+{
+ xfs_buf_t *bp;
+ xfs_caddr_t offset;
+ xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
+ int num_scan_bblks;
+ uint first_half_cycle, last_half_cycle;
+ uint stop_on_cycle;
+ int error, log_bbnum = log->l_logBBsize;
+
+ /* Is the end of the log device zeroed? */
+ if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
+ *return_head_blk = first_blk;
+
+ /* Is the whole lot zeroed? */
+ if (!first_blk) {
+ /* Linux XFS shouldn't generate totally zeroed logs -
+ * mkfs etc write a dummy unmount record to a fresh
+ * log so we can store the uuid in there
+ */
+ xfs_warn(log->l_mp, "totally zeroed log");
+ }
+
+ return 0;
+ } else if (error) {
+ xfs_warn(log->l_mp, "empty log check failed");
+ return error;
+ }
+
+ first_blk = 0; /* get cycle # of 1st block */
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return ENOMEM;
+
+ error = xlog_bread(log, 0, 1, bp, &offset);
+ if (error)
+ goto bp_err;
+
+ first_half_cycle = xlog_get_cycle(offset);
+
+ last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
+ error = xlog_bread(log, last_blk, 1, bp, &offset);
+ if (error)
+ goto bp_err;
+
+ last_half_cycle = xlog_get_cycle(offset);
+ ASSERT(last_half_cycle != 0);
+
+ /*
+ * If the 1st half cycle number is equal to the last half cycle number,
+ * then the entire log is stamped with the same cycle number. In this
+ * case, head_blk can't be set to zero (which makes sense). The below
+ * math doesn't work out properly with head_blk equal to zero. Instead,
+ * we set it to log_bbnum which is an invalid block number, but this
+ * value makes the math correct. If head_blk doesn't changed through
+ * all the tests below, *head_blk is set to zero at the very end rather
+ * than log_bbnum. In a sense, log_bbnum and zero are the same block
+ * in a circular file.
+ */
+ if (first_half_cycle == last_half_cycle) {
+ /*
+ * In this case we believe that the entire log should have
+ * cycle number last_half_cycle. We need to scan backwards
+ * from the end verifying that there are no holes still
+ * containing last_half_cycle - 1. If we find such a hole,
+ * then the start of that hole will be the new head. The
+ * simple case looks like
+ * x | x ... | x - 1 | x
+ * Another case that fits this picture would be
+ * x | x + 1 | x ... | x
+ * In this case the head really is somewhere at the end of the
+ * log, as one of the latest writes at the beginning was
+ * incomplete.
+ * One more case is
+ * x | x + 1 | x ... | x - 1 | x
+ * This is really the combination of the above two cases, and
+ * the head has to end up at the start of the x-1 hole at the
+ * end of the log.
+ *
+ * In the 256k log case, we will read from the beginning to the
+ * end of the log and search for cycle numbers equal to x-1.
+ * We don't worry about the x+1 blocks that we encounter,
+ * because we know that they cannot be the head since the log
+ * started with x.
+ */
+ head_blk = log_bbnum;
+ stop_on_cycle = last_half_cycle - 1;
+ } else {
+ /*
+ * In this case we want to find the first block with cycle
+ * number matching last_half_cycle. We expect the log to be
+ * some variation on
+ * x + 1 ... | x ... | x
+ * The first block with cycle number x (last_half_cycle) will
+ * be where the new head belongs. First we do a binary search
+ * for the first occurrence of last_half_cycle. The binary
+ * search may not be totally accurate, so then we scan back
+ * from there looking for occurrences of last_half_cycle before
+ * us. If that backwards scan wraps around the beginning of
+ * the log, then we look for occurrences of last_half_cycle - 1
+ * at the end of the log. The cases we're looking for look
+ * like
+ * v binary search stopped here
+ * x + 1 ... | x | x + 1 | x ... | x
+ * ^ but we want to locate this spot
+ * or
+ * <---------> less than scan distance
+ * x + 1 ... | x ... | x - 1 | x
+ * ^ we want to locate this spot
+ */
+ stop_on_cycle = last_half_cycle;
+ if ((error = xlog_find_cycle_start(log, bp, first_blk,
+ &head_blk, last_half_cycle)))
+ goto bp_err;
+ }
+
+ /*
+ * Now validate the answer. Scan back some number of maximum possible
+ * blocks and make sure each one has the expected cycle number. The
+ * maximum is determined by the total possible amount of buffering
+ * in the in-core log. The following number can be made tighter if
+ * we actually look at the block size of the filesystem.
+ */
+ num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+ if (head_blk >= num_scan_bblks) {
+ /*
+ * We are guaranteed that the entire check can be performed
+ * in one buffer.
+ */
+ start_blk = head_blk - num_scan_bblks;
+ if ((error = xlog_find_verify_cycle(log,
+ start_blk, num_scan_bblks,
+ stop_on_cycle, &new_blk)))
+ goto bp_err;
+ if (new_blk != -1)
+ head_blk = new_blk;
+ } else { /* need to read 2 parts of log */
+ /*
+ * We are going to scan backwards in the log in two parts.
+ * First we scan the physical end of the log. In this part
+ * of the log, we are looking for blocks with cycle number
+ * last_half_cycle - 1.
+ * If we find one, then we know that the log starts there, as
+ * we've found a hole that didn't get written in going around
+ * the end of the physical log. The simple case for this is
+ * x + 1 ... | x ... | x - 1 | x
+ * <---------> less than scan distance
+ * If all of the blocks at the end of the log have cycle number
+ * last_half_cycle, then we check the blocks at the start of
+ * the log looking for occurrences of last_half_cycle. If we
+ * find one, then our current estimate for the location of the
+ * first occurrence of last_half_cycle is wrong and we move
+ * back to the hole we've found. This case looks like
+ * x + 1 ... | x | x + 1 | x ...
+ * ^ binary search stopped here
+ * Another case we need to handle that only occurs in 256k
+ * logs is
+ * x + 1 ... | x ... | x+1 | x ...
+ * ^ binary search stops here
+ * In a 256k log, the scan at the end of the log will see the
+ * x + 1 blocks. We need to skip past those since that is
+ * certainly not the head of the log. By searching for
+ * last_half_cycle-1 we accomplish that.
+ */
+ ASSERT(head_blk <= INT_MAX &&
+ (xfs_daddr_t) num_scan_bblks >= head_blk);
+ start_blk = log_bbnum - (num_scan_bblks - head_blk);
+ if ((error = xlog_find_verify_cycle(log, start_blk,
+ num_scan_bblks - (int)head_blk,
+ (stop_on_cycle - 1), &new_blk)))
+ goto bp_err;
+ if (new_blk != -1) {
+ head_blk = new_blk;
+ goto validate_head;
+ }
+
+ /*
+ * Scan beginning of log now. The last part of the physical
+ * log is good. This scan needs to verify that it doesn't find
+ * the last_half_cycle.
+ */
+ start_blk = 0;
+ ASSERT(head_blk <= INT_MAX);
+ if ((error = xlog_find_verify_cycle(log,
+ start_blk, (int)head_blk,
+ stop_on_cycle, &new_blk)))
+ goto bp_err;
+ if (new_blk != -1)
+ head_blk = new_blk;
+ }
+
+validate_head:
+ /*
+ * Now we need to make sure head_blk is not pointing to a block in
+ * the middle of a log record.
+ */
+ num_scan_bblks = XLOG_REC_SHIFT(log);
+ if (head_blk >= num_scan_bblks) {
+ start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
+
+ /* start ptr at last block ptr before head_blk */
+ if ((error = xlog_find_verify_log_record(log, start_blk,
+ &head_blk, 0)) == -1) {
+ error = XFS_ERROR(EIO);
+ goto bp_err;
+ } else if (error)
+ goto bp_err;
+ } else {
+ start_blk = 0;
+ ASSERT(head_blk <= INT_MAX);
+ if ((error = xlog_find_verify_log_record(log, start_blk,
+ &head_blk, 0)) == -1) {
+ /* We hit the beginning of the log during our search */
+ start_blk = log_bbnum - (num_scan_bblks - head_blk);
+ new_blk = log_bbnum;
+ ASSERT(start_blk <= INT_MAX &&
+ (xfs_daddr_t) log_bbnum-start_blk >= 0);
+ ASSERT(head_blk <= INT_MAX);
+ if ((error = xlog_find_verify_log_record(log,
+ start_blk, &new_blk,
+ (int)head_blk)) == -1) {
+ error = XFS_ERROR(EIO);
+ goto bp_err;
+ } else if (error)
+ goto bp_err;
+ if (new_blk != log_bbnum)
+ head_blk = new_blk;
+ } else if (error)
+ goto bp_err;
+ }
+
+ xlog_put_bp(bp);
+ if (head_blk == log_bbnum)
+ *return_head_blk = 0;
+ else
+ *return_head_blk = head_blk;
+ /*
+ * When returning here, we have a good block number. Bad block
+ * means that during a previous crash, we didn't have a clean break
+ * from cycle number N to cycle number N-1. In this case, we need
+ * to find the first block with cycle number N-1.
+ */
+ return 0;
+
+ bp_err:
+ xlog_put_bp(bp);
+
+ if (error)
+ xfs_warn(log->l_mp, "failed to find log head");
+ return error;
+}
+
+/*
+ * Find the sync block number or the tail of the log.
+ *
+ * This will be the block number of the last record to have its
+ * associated buffers synced to disk. Every log record header has
+ * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
+ * to get a sync block number. The only concern is to figure out which
+ * log record header to believe.
+ *
+ * The following algorithm uses the log record header with the largest
+ * lsn. The entire log record does not need to be valid. We only care
+ * that the header is valid.
+ *
+ * We could speed up search by using current head_blk buffer, but it is not
+ * available.
+ */
+STATIC int
+xlog_find_tail(
+ xlog_t *log,
+ xfs_daddr_t *head_blk,
+ xfs_daddr_t *tail_blk)
+{
+ xlog_rec_header_t *rhead;
+ xlog_op_header_t *op_head;
+ xfs_caddr_t offset = NULL;
+ xfs_buf_t *bp;
+ int error, i, found;
+ xfs_daddr_t umount_data_blk;
+ xfs_daddr_t after_umount_blk;
+ xfs_lsn_t tail_lsn;
+ int hblks;
+
+ found = 0;
+
+ /*
+ * Find previous log record
+ */
+ if ((error = xlog_find_head(log, head_blk)))
+ return error;
+
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return ENOMEM;
+ if (*head_blk == 0) { /* special case */
+ error = xlog_bread(log, 0, 1, bp, &offset);
+ if (error)
+ goto done;
+
+ if (xlog_get_cycle(offset) == 0) {
+ *tail_blk = 0;
+ /* leave all other log inited values alone */
+ goto done;
+ }
+ }
+
+ /*
+ * Search backwards looking for log record header block
+ */
+ ASSERT(*head_blk < INT_MAX);
+ for (i = (int)(*head_blk) - 1; i >= 0; i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto done;
+
+ if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
+ found = 1;
+ break;
+ }
+ }
+ /*
+ * If we haven't found the log record header block, start looking
+ * again from the end of the physical log. XXXmiken: There should be
+ * a check here to make sure we didn't search more than N blocks in
+ * the previous code.
+ */
+ if (!found) {
+ for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto done;
+
+ if (XLOG_HEADER_MAGIC_NUM ==
+ be32_to_cpu(*(__be32 *)offset)) {
+ found = 2;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+
+ /* find blk_no of tail of log */
+ rhead = (xlog_rec_header_t *)offset;
+ *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+
+ /*
+ * Reset log values according to the state of the log when we
+ * crashed. In the case where head_blk == 0, we bump curr_cycle
+ * one because the next write starts a new cycle rather than
+ * continuing the cycle of the last good log record. At this
+ * point we have guaranteed that all partial log records have been
+ * accounted for. Therefore, we know that the last good log record
+ * written was complete and ended exactly on the end boundary
+ * of the physical log.
+ */
+ log->l_prev_block = i;
+ log->l_curr_block = (int)*head_blk;
+ log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+ if (found == 2)
+ log->l_curr_cycle++;
+ atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+ atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+ xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+ xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+
+ /*
+ * Look for unmount record. If we find it, then we know there
+ * was a clean unmount. Since 'i' could be the last block in
+ * the physical log, we convert to a log block before comparing
+ * to the head_blk.
+ *
+ * Save the current tail lsn to use to pass to
+ * xlog_clear_stale_blocks() below. We won't want to clear the
+ * unmount record if there is one, so we pass the lsn of the
+ * unmount record rather than the block after it.
+ */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ int h_size = be32_to_cpu(rhead->h_size);
+ int h_version = be32_to_cpu(rhead->h_version);
+
+ if ((h_version & XLOG_VERSION_2) &&
+ (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+ hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+ if (h_size % XLOG_HEADER_CYCLE_SIZE)
+ hblks++;
+ } else {
+ hblks = 1;
+ }
+ } else {
+ hblks = 1;
+ }
+ after_umount_blk = (i + hblks + (int)
+ BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
+ if (*head_blk == after_umount_blk &&
+ be32_to_cpu(rhead->h_num_logops) == 1) {
+ umount_data_blk = (i + hblks) % log->l_logBBsize;
+ error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ if (error)
+ goto done;
+
+ op_head = (xlog_op_header_t *)offset;
+ if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+ /*
+ * Set tail and last sync so that newly written
+ * log records will point recovery to after the
+ * current unmount record.
+ */
+ xlog_assign_atomic_lsn(&log->l_tail_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ *tail_blk = after_umount_blk;
+
+ /*
+ * Note that the unmount was clean. If the unmount
+ * was not clean, we need to know this to rebuild the
+ * superblock counters from the perag headers if we
+ * have a filesystem using non-persistent counters.
+ */
+ log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+ }
+ }
+
+ /*
+ * Make sure that there are no blocks in front of the head
+ * with the same cycle number as the head. This can happen
+ * because we allow multiple outstanding log writes concurrently,
+ * and the later writes might make it out before earlier ones.
+ *
+ * We use the lsn from before modifying it so that we'll never
+ * overwrite the unmount record after a clean unmount.
+ *
+ * Do this only if we are going to recover the filesystem
+ *
+ * NOTE: This used to say "if (!readonly)"
+ * However on Linux, we can & do recover a read-only filesystem.
+ * We only skip recovery if NORECOVERY is specified on mount,
+ * in which case we would not be here.
+ *
+ * But... if the -device- itself is readonly, just skip this.
+ * We can't recover this device anyway, so it won't matter.
+ */
+ if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
+ error = xlog_clear_stale_blocks(log, tail_lsn);
+
+done:
+ xlog_put_bp(bp);
+
+ if (error)
+ xfs_warn(log->l_mp, "failed to locate log tail");
+ return error;
+}
+
+/*
+ * Is the log zeroed at all?
+ *
+ * The last binary search should be changed to perform an X block read
+ * once X becomes small enough. You can then search linearly through
+ * the X blocks. This will cut down on the number of reads we need to do.
+ *
+ * If the log is partially zeroed, this routine will pass back the blkno
+ * of the first block with cycle number 0. It won't have a complete LR
+ * preceding it.
+ *
+ * Return:
+ * 0 => the log is completely written to
+ * -1 => use *blk_no as the first block of the log
+ * >0 => error has occurred
+ */
+STATIC int
+xlog_find_zeroed(
+ xlog_t *log,
+ xfs_daddr_t *blk_no)
+{
+ xfs_buf_t *bp;
+ xfs_caddr_t offset;
+ uint first_cycle, last_cycle;
+ xfs_daddr_t new_blk, last_blk, start_blk;
+ xfs_daddr_t num_scan_bblks;
+ int error, log_bbnum = log->l_logBBsize;
+
+ *blk_no = 0;
+
+ /* check totally zeroed log */
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return ENOMEM;
+ error = xlog_bread(log, 0, 1, bp, &offset);
+ if (error)
+ goto bp_err;
+
+ first_cycle = xlog_get_cycle(offset);
+ if (first_cycle == 0) { /* completely zeroed log */
+ *blk_no = 0;
+ xlog_put_bp(bp);
+ return -1;
+ }
+
+ /* check partially zeroed log */
+ error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+ if (error)
+ goto bp_err;
+
+ last_cycle = xlog_get_cycle(offset);
+ if (last_cycle != 0) { /* log completely written to */
+ xlog_put_bp(bp);
+ return 0;
+ } else if (first_cycle != 1) {
+ /*
+ * If the cycle of the last block is zero, the cycle of
+ * the first block must be 1. If it's not, maybe we're
+ * not looking at a log... Bail out.
+ */
+ xfs_warn(log->l_mp,
+ "Log inconsistent or not a log (last==0, first!=1)");
+ return XFS_ERROR(EINVAL);
+ }
+
+ /* we have a partially zeroed log */
+ last_blk = log_bbnum-1;
+ if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
+ goto bp_err;
+
+ /*
+ * Validate the answer. Because there is no way to guarantee that
+ * the entire log is made up of log records which are the same size,
+ * we scan over the defined maximum blocks. At this point, the maximum
+ * is not chosen to mean anything special. XXXmiken
+ */
+ num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+ ASSERT(num_scan_bblks <= INT_MAX);
+
+ if (last_blk < num_scan_bblks)
+ num_scan_bblks = last_blk;
+ start_blk = last_blk - num_scan_bblks;
+
+ /*
+ * We search for any instances of cycle number 0 that occur before
+ * our current estimate of the head. What we're trying to detect is
+ * 1 ... | 0 | 1 | 0...
+ * ^ binary search ends here
+ */
+ if ((error = xlog_find_verify_cycle(log, start_blk,
+ (int)num_scan_bblks, 0, &new_blk)))
+ goto bp_err;
+ if (new_blk != -1)
+ last_blk = new_blk;
+
+ /*
+ * Potentially backup over partial log record write. We don't need
+ * to search the end of the log because we know it is zero.
+ */
+ if ((error = xlog_find_verify_log_record(log, start_blk,
+ &last_blk, 0)) == -1) {
+ error = XFS_ERROR(EIO);
+ goto bp_err;
+ } else if (error)
+ goto bp_err;
+
+ *blk_no = last_blk;
+bp_err:
+ xlog_put_bp(bp);
+ if (error)
+ return error;
+ return -1;
+}
+
+/*
+ * These are simple subroutines used by xlog_clear_stale_blocks() below
+ * to initialize a buffer full of empty log record headers and write
+ * them into the log.
+ */
+STATIC void
+xlog_add_record(
+ xlog_t *log,
+ xfs_caddr_t buf,
+ int cycle,
+ int block,
+ int tail_cycle,
+ int tail_block)
+{
+ xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
+
+ memset(buf, 0, BBSIZE);
+ recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+ recp->h_cycle = cpu_to_be32(cycle);
+ recp->h_version = cpu_to_be32(
+ xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
+ recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
+ recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
+ recp->h_fmt = cpu_to_be32(XLOG_FMT);
+ memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
+}
+
+STATIC int
+xlog_write_log_records(
+ xlog_t *log,
+ int cycle,
+ int start_block,
+ int blocks,
+ int tail_cycle,
+ int tail_block)
+{
+ xfs_caddr_t offset;
+ xfs_buf_t *bp;
+ int balign, ealign;
+ int sectbb = log->l_sectBBsize;
+ int end_block = start_block + blocks;
+ int bufblks;
+ int error = 0;
+ int i, j = 0;
+
+ /*
+ * Greedily allocate a buffer big enough to handle the full
+ * range of basic blocks to be written. If that fails, try
+ * a smaller size. We need to be able to write at least a
+ * log sector, or we're out of luck.
+ */
+ bufblks = 1 << ffs(blocks);
+ while (!(bp = xlog_get_bp(log, bufblks))) {
+ bufblks >>= 1;
+ if (bufblks < sectbb)
+ return ENOMEM;
+ }
+
+ /* We may need to do a read at the start to fill in part of
+ * the buffer in the starting sector not covered by the first
+ * write below.
+ */
+ balign = round_down(start_block, sectbb);
+ if (balign != start_block) {
+ error = xlog_bread_noalign(log, start_block, 1, bp);
+ if (error)
+ goto out_put_bp;
+
+ j = start_block - balign;
+ }
+
+ for (i = start_block; i < end_block; i += bufblks) {
+ int bcount, endcount;
+
+ bcount = min(bufblks, end_block - start_block);
+ endcount = bcount - j;
+
+ /* We may need to do a read at the end to fill in part of
+ * the buffer in the final sector not covered by the write.
+ * If this is the same sector as the above read, skip it.
+ */
+ ealign = round_down(end_block, sectbb);
+ if (j == 0 && (start_block + endcount > ealign)) {
+ offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
+ error = xlog_bread_offset(log, ealign, sectbb,
+ bp, offset);
+ if (error)
+ break;
+
+ }
+
+ offset = xlog_align(log, start_block, endcount, bp);
+ for (; j < endcount; j++) {
+ xlog_add_record(log, offset, cycle, i+j,
+ tail_cycle, tail_block);
+ offset += BBSIZE;
+ }
+ error = xlog_bwrite(log, start_block, endcount, bp);
+ if (error)
+ break;
+ start_block += endcount;
+ j = 0;
+ }
+
+ out_put_bp:
+ xlog_put_bp(bp);
+ return error;
+}
+
+/*
+ * This routine is called to blow away any incomplete log writes out
+ * in front of the log head. We do this so that we won't become confused
+ * if we come up, write only a little bit more, and then crash again.
+ * If we leave the partial log records out there, this situation could
+ * cause us to think those partial writes are valid blocks since they
+ * have the current cycle number. We get rid of them by overwriting them
+ * with empty log records with the old cycle number rather than the
+ * current one.
+ *
+ * The tail lsn is passed in rather than taken from
+ * the log so that we will not write over the unmount record after a
+ * clean unmount in a 512 block log. Doing so would leave the log without
+ * any valid log records in it until a new one was written. If we crashed
+ * during that time we would not be able to recover.
+ */
+STATIC int
+xlog_clear_stale_blocks(
+ xlog_t *log,
+ xfs_lsn_t tail_lsn)
+{
+ int tail_cycle, head_cycle;
+ int tail_block, head_block;
+ int tail_distance, max_distance;
+ int distance;
+ int error;
+
+ tail_cycle = CYCLE_LSN(tail_lsn);
+ tail_block = BLOCK_LSN(tail_lsn);
+ head_cycle = log->l_curr_cycle;
+ head_block = log->l_curr_block;
+
+ /*
+ * Figure out the distance between the new head of the log
+ * and the tail. We want to write over any blocks beyond the
+ * head that we may have written just before the crash, but
+ * we don't want to overwrite the tail of the log.
+ */
+ if (head_cycle == tail_cycle) {
+ /*
+ * The tail is behind the head in the physical log,
+ * so the distance from the head to the tail is the
+ * distance from the head to the end of the log plus
+ * the distance from the beginning of the log to the
+ * tail.
+ */
+ if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
+ XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
+ XFS_ERRLEVEL_LOW, log->l_mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ tail_distance = tail_block + (log->l_logBBsize - head_block);
+ } else {
+ /*
+ * The head is behind the tail in the physical log,
+ * so the distance from the head to the tail is just
+ * the tail block minus the head block.
+ */
+ if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
+ XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
+ XFS_ERRLEVEL_LOW, log->l_mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ tail_distance = tail_block - head_block;
+ }
+
+ /*
+ * If the head is right up against the tail, we can't clear
+ * anything.
+ */
+ if (tail_distance <= 0) {
+ ASSERT(tail_distance == 0);
+ return 0;
+ }
+
+ max_distance = XLOG_TOTAL_REC_SHIFT(log);
+ /*
+ * Take the smaller of the maximum amount of outstanding I/O
+ * we could have and the distance to the tail to clear out.
+ * We take the smaller so that we don't overwrite the tail and
+ * we don't waste all day writing from the head to the tail
+ * for no reason.
+ */
+ max_distance = MIN(max_distance, tail_distance);
+
+ if ((head_block + max_distance) <= log->l_logBBsize) {
+ /*
+ * We can stomp all the blocks we need to without
+ * wrapping around the end of the log. Just do it
+ * in a single write. Use the cycle number of the
+ * current cycle minus one so that the log will look like:
+ * n ... | n - 1 ...
+ */
+ error = xlog_write_log_records(log, (head_cycle - 1),
+ head_block, max_distance, tail_cycle,
+ tail_block);
+ if (error)
+ return error;
+ } else {
+ /*
+ * We need to wrap around the end of the physical log in
+ * order to clear all the blocks. Do it in two separate
+ * I/Os. The first write should be from the head to the
+ * end of the physical log, and it should use the current
+ * cycle number minus one just like above.
+ */
+ distance = log->l_logBBsize - head_block;
+ error = xlog_write_log_records(log, (head_cycle - 1),
+ head_block, distance, tail_cycle,
+ tail_block);
+
+ if (error)
+ return error;
+
+ /*
+ * Now write the blocks at the start of the physical log.
+ * This writes the remainder of the blocks we want to clear.
+ * It uses the current cycle number since we're now on the
+ * same cycle as the head so that we get:
+ * n ... n ... | n - 1 ...
+ * ^^^^^ blocks we're writing
+ */
+ distance = max_distance - (log->l_logBBsize - head_block);
+ error = xlog_write_log_records(log, head_cycle, 0, distance,
+ tail_cycle, tail_block);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/******************************************************************************
+ *
+ * Log recover routines
+ *
+ ******************************************************************************
+ */
+
+STATIC xlog_recover_t *
+xlog_recover_find_tid(
+ struct hlist_head *head,
+ xlog_tid_t tid)
+{
+ xlog_recover_t *trans;
+ struct hlist_node *n;
+
+ hlist_for_each_entry(trans, n, head, r_list) {
+ if (trans->r_log_tid == tid)
+ return trans;
+ }
+ return NULL;
+}
+
+STATIC void
+xlog_recover_new_tid(
+ struct hlist_head *head,
+ xlog_tid_t tid,
+ xfs_lsn_t lsn)
+{
+ xlog_recover_t *trans;
+
+ trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
+ trans->r_log_tid = tid;
+ trans->r_lsn = lsn;
+ INIT_LIST_HEAD(&trans->r_itemq);
+
+ INIT_HLIST_NODE(&trans->r_list);
+ hlist_add_head(&trans->r_list, head);
+}
+
+STATIC void
+xlog_recover_add_item(
+ struct list_head *head)
+{
+ xlog_recover_item_t *item;
+
+ item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+ INIT_LIST_HEAD(&item->ri_list);
+ list_add_tail(&item->ri_list, head);
+}
+
+STATIC int
+xlog_recover_add_to_cont_trans(
+ struct log *log,
+ xlog_recover_t *trans,
+ xfs_caddr_t dp,
+ int len)
+{
+ xlog_recover_item_t *item;
+ xfs_caddr_t ptr, old_ptr;
+ int old_len;
+
+ if (list_empty(&trans->r_itemq)) {
+ /* finish copying rest of trans header */
+ xlog_recover_add_item(&trans->r_itemq);
+ ptr = (xfs_caddr_t) &trans->r_theader +
+ sizeof(xfs_trans_header_t) - len;
+ memcpy(ptr, dp, len); /* d, s, l */
+ return 0;
+ }
+ /* take the tail entry */
+ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+
+ old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
+ old_len = item->ri_buf[item->ri_cnt-1].i_len;
+
+ ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
+ memcpy(&ptr[old_len], dp, len); /* d, s, l */
+ item->ri_buf[item->ri_cnt-1].i_len += len;
+ item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+ trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
+ return 0;
+}
+
+/*
+ * The next region to add is the start of a new region. It could be
+ * a whole region or it could be the first part of a new region. Because
+ * of this, the assumption here is that the type and size fields of all
+ * format structures fit into the first 32 bits of the structure.
+ *
+ * This works because all regions must be 32 bit aligned. Therefore, we
+ * either have both fields or we have neither field. In the case we have
+ * neither field, the data part of the region is zero length. We only have
+ * a log_op_header and can throw away the header since a new one will appear
+ * later. If we have at least 4 bytes, then we can determine how many regions
+ * will appear in the current log item.
+ */
+STATIC int
+xlog_recover_add_to_trans(
+ struct log *log,
+ xlog_recover_t *trans,
+ xfs_caddr_t dp,
+ int len)
+{
+ xfs_inode_log_format_t *in_f; /* any will do */
+ xlog_recover_item_t *item;
+ xfs_caddr_t ptr;
+
+ if (!len)
+ return 0;
+ if (list_empty(&trans->r_itemq)) {
+ /* we need to catch log corruptions here */
+ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+ xfs_warn(log->l_mp, "%s: bad header magic number",
+ __func__);
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+ if (len == sizeof(xfs_trans_header_t))
+ xlog_recover_add_item(&trans->r_itemq);
+ memcpy(&trans->r_theader, dp, len); /* d, s, l */
+ return 0;
+ }
+
+ ptr = kmem_alloc(len, KM_SLEEP);
+ memcpy(ptr, dp, len);
+ in_f = (xfs_inode_log_format_t *)ptr;
+
+ /* take the tail entry */
+ item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+ if (item->ri_total != 0 &&
+ item->ri_total == item->ri_cnt) {
+ /* tail item is in use, get a new one */
+ xlog_recover_add_item(&trans->r_itemq);
+ item = list_entry(trans->r_itemq.prev,
+ xlog_recover_item_t, ri_list);
+ }
+
+ if (item->ri_total == 0) { /* first region to be added */
+ if (in_f->ilf_size == 0 ||
+ in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+ xfs_warn(log->l_mp,
+ "bad number of regions (%d) in inode log format",
+ in_f->ilf_size);
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+
+ item->ri_total = in_f->ilf_size;
+ item->ri_buf =
+ kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ KM_SLEEP);
+ }
+ ASSERT(item->ri_total > item->ri_cnt);
+ /* Description region is ri_buf[0] */
+ item->ri_buf[item->ri_cnt].i_addr = ptr;
+ item->ri_buf[item->ri_cnt].i_len = len;
+ item->ri_cnt++;
+ trace_xfs_log_recover_item_add(log, trans, item, 0);
+ return 0;
+}
+
+/*
+ * Sort the log items in the transaction. Cancelled buffers need
+ * to be put first so they are processed before any items that might
+ * modify the buffers. If they are cancelled, then the modifications
+ * don't need to be replayed.
+ */
+STATIC int
+xlog_recover_reorder_trans(
+ struct log *log,
+ xlog_recover_t *trans,
+ int pass)
+{
+ xlog_recover_item_t *item, *n;
+ LIST_HEAD(sort_list);
+
+ list_splice_init(&trans->r_itemq, &sort_list);
+ list_for_each_entry_safe(item, n, &sort_list, ri_list) {
+ xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
+
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+ trace_xfs_log_recover_item_reorder_head(log,
+ trans, item, pass);
+ list_move(&item->ri_list, &trans->r_itemq);
+ break;
+ }
+ case XFS_LI_INODE:
+ case XFS_LI_DQUOT:
+ case XFS_LI_QUOTAOFF:
+ case XFS_LI_EFD:
+ case XFS_LI_EFI:
+ trace_xfs_log_recover_item_reorder_tail(log,
+ trans, item, pass);
+ list_move_tail(&item->ri_list, &trans->r_itemq);
+ break;
+ default:
+ xfs_warn(log->l_mp,
+ "%s: unrecognized type of log operation",
+ __func__);
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+ }
+ ASSERT(list_empty(&sort_list));
+ return 0;
+}
+
+/*
+ * Build up the table of buf cancel records so that we don't replay
+ * cancelled data in the second pass. For buffer records that are
+ * not cancel records, there is nothing to do here so we just return.
+ *
+ * If we get a cancel record which is already in the table, this indicates
+ * that the buffer was cancelled multiple times. In order to ensure
+ * that during pass 2 we keep the record in the table until we reach its
+ * last occurrence in the log, we keep a reference count in the cancel
+ * record in the table to tell us how many times we expect to see this
+ * record during the second pass.
+ */
+STATIC int
+xlog_recover_buffer_pass1(
+ struct log *log,
+ xlog_recover_item_t *item)
+{
+ xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
+
+ /*
+ * If this isn't a cancel buffer item, then just return.
+ */
+ if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+ trace_xfs_log_recover_buf_not_cancel(log, buf_f);
+ return 0;
+ }
+
+ /*
+ * Insert an xfs_buf_cancel record into the hash table of them.
+ * If there is already an identical record, bump its reference count.
+ */
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == buf_f->blf_blkno &&
+ bcp->bc_len == buf_f->blf_len) {
+ bcp->bc_refcount++;
+ trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
+ return 0;
+ }
+ }
+
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+ bcp->bc_blkno = buf_f->blf_blkno;
+ bcp->bc_len = buf_f->blf_len;
+ bcp->bc_refcount = 1;
+ list_add_tail(&bcp->bc_list, bucket);
+
+ trace_xfs_log_recover_buf_cancel_add(log, buf_f);
+ return 0;
+}
+
+/*
+ * Check to see whether the buffer being recovered has a corresponding
+ * entry in the buffer cancel record table. If it does then return 1
+ * so that it will be cancelled, otherwise return 0. If the buffer is
+ * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
+ * the refcount on the entry in the table and remove it from the table
+ * if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its
+ * last occurrence in the log so that if the same buffer is re-used
+ * again after its last cancellation we actually replay the changes
+ * made at that point.
+ */
+STATIC int
+xlog_check_buffer_cancelled(
+ struct log *log,
+ xfs_daddr_t blkno,
+ uint len,
+ ushort flags)
+{
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
+
+ if (log->l_buf_cancel_table == NULL) {
+ /*
+ * There is nothing in the table built in pass one,
+ * so this buffer must not be cancelled.
+ */
+ ASSERT(!(flags & XFS_BLF_CANCEL));
+ return 0;
+ }
+
+ /*
+ * Search for an entry in the cancel table that matches our buffer.
+ */
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+ goto found;
+ }
+
+ /*
+ * We didn't find a corresponding entry in the table, so return 0 so
+ * that the buffer is NOT cancelled.
+ */
+ ASSERT(!(flags & XFS_BLF_CANCEL));
+ return 0;
+
+found:
+ /*
+ * We've go a match, so return 1 so that the recovery of this buffer
+ * is cancelled. If this buffer is actually a buffer cancel log
+ * item, then decrement the refcount on the one in the table and
+ * remove it if this is the last reference.
+ */
+ if (flags & XFS_BLF_CANCEL) {
+ if (--bcp->bc_refcount == 0) {
+ list_del(&bcp->bc_list);
+ kmem_free(bcp);
+ }
+ }
+ return 1;
+}
+
+/*
+ * Perform recovery for a buffer full of inodes. In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures. The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
+ *
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes. In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
+ */
+STATIC int
+xlog_recover_do_inode_buffer(
+ struct xfs_mount *mp,
+ xlog_recover_item_t *item,
+ struct xfs_buf *bp,
+ xfs_buf_log_format_t *buf_f)
+{
+ int i;
+ int item_index = 0;
+ int bit = 0;
+ int nbits = 0;
+ int reg_buf_offset = 0;
+ int reg_buf_bytes = 0;
+ int next_unlinked_offset;
+ int inodes_per_buf;
+ xfs_agino_t *logged_nextp;
+ xfs_agino_t *buffer_nextp;
+
+ trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
+ inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+ for (i = 0; i < inodes_per_buf; i++) {
+ next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+ offsetof(xfs_dinode_t, di_next_unlinked);
+
+ while (next_unlinked_offset >=
+ (reg_buf_offset + reg_buf_bytes)) {
+ /*
+ * The next di_next_unlinked field is beyond
+ * the current logged region. Find the next
+ * logged region that contains or is beyond
+ * the current di_next_unlinked field.
+ */
+ bit += nbits;
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+
+ /*
+ * If there are no more logged regions in the
+ * buffer, then we're done.
+ */
+ if (bit == -1)
+ return 0;
+
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ reg_buf_offset = bit << XFS_BLF_SHIFT;
+ reg_buf_bytes = nbits << XFS_BLF_SHIFT;
+ item_index++;
+ }
+
+ /*
+ * If the current logged region starts after the current
+ * di_next_unlinked field, then move on to the next
+ * di_next_unlinked field.
+ */
+ if (next_unlinked_offset < reg_buf_offset)
+ continue;
+
+ ASSERT(item->ri_buf[item_index].i_addr != NULL);
+ ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+ ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
+
+ /*
+ * The current logged region contains a copy of the
+ * current di_next_unlinked field. Extract its value
+ * and copy it to the buffer copy.
+ */
+ logged_nextp = item->ri_buf[item_index].i_addr +
+ next_unlinked_offset - reg_buf_offset;
+ if (unlikely(*logged_nextp == 0)) {
+ xfs_alert(mp,
+ "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
+ "Trying to replay bad (0) inode di_next_unlinked field.",
+ item, bp);
+ XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
+ next_unlinked_offset);
+ *buffer_nextp = *logged_nextp;
+ }
+
+ return 0;
+}
+
+/*
+ * Perform a 'normal' buffer recovery. Each logged region of the
+ * buffer should be copied over the corresponding region in the
+ * given buffer. The bitmap in the buf log format structure indicates
+ * where to place the logged data.
+ */
+STATIC void
+xlog_recover_do_reg_buffer(
+ struct xfs_mount *mp,
+ xlog_recover_item_t *item,
+ struct xfs_buf *bp,
+ xfs_buf_log_format_t *buf_f)
+{
+ int i;
+ int bit;
+ int nbits;
+ int error;
+
+ trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
+ bit = 0;
+ i = 1; /* 0 is the buf format structure */
+ while (1) {
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ if (bit == -1)
+ break;
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ ASSERT(item->ri_buf[i].i_addr != NULL);
+ ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+ ASSERT(XFS_BUF_COUNT(bp) >=
+ ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
+
+ /*
+ * Do a sanity check if this is a dquot buffer. Just checking
+ * the first dquot in the buffer should do. XXXThis is
+ * probably a good thing to do for other buf types also.
+ */
+ error = 0;
+ if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ if (item->ri_buf[i].i_addr == NULL) {
+ xfs_alert(mp,
+ "XFS: NULL dquot in %s.", __func__);
+ goto next;
+ }
+ if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
+ xfs_alert(mp,
+ "XFS: dquot too small (%d) in %s.",
+ item->ri_buf[i].i_len, __func__);
+ goto next;
+ }
+ error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
+ -1, 0, XFS_QMOPT_DOWARN,
+ "dquot_buf_recover");
+ if (error)
+ goto next;
+ }
+
+ memcpy(xfs_buf_offset(bp,
+ (uint)bit << XFS_BLF_SHIFT), /* dest */
+ item->ri_buf[i].i_addr, /* source */
+ nbits<<XFS_BLF_SHIFT); /* length */
+ next:
+ i++;
+ bit += nbits;
+ }
+
+ /* Shouldn't be any more regions */
+ ASSERT(i == item->ri_total);
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+int
+xfs_qm_dqcheck(
+ struct xfs_mount *mp,
+ xfs_disk_dquot_t *ddq,
+ xfs_dqid_t id,
+ uint type, /* used only when IO_dorepair is true */
+ uint flags,
+ char *str)
+{
+ xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
+ int errs = 0;
+
+ /*
+ * We can encounter an uninitialized dquot buffer for 2 reasons:
+ * 1. If we crash while deleting the quotainode(s), and those blks got
+ * used for user data. This is because we take the path of regular
+ * file deletion; however, the size field of quotainodes is never
+ * updated, so all the tricks that we play in itruncate_finish
+ * don't quite matter.
+ *
+ * 2. We don't play the quota buffers when there's a quotaoff logitem.
+ * But the allocation will be replayed so we'll end up with an
+ * uninitialized quota block.
+ *
+ * This is all fine; things are still consistent, and we haven't lost
+ * any quota information. Just don't complain about bad dquot blks.
+ */
+ if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+ str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+ errs++;
+ }
+ if (ddq->d_version != XFS_DQUOT_VERSION) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+ str, id, ddq->d_version, XFS_DQUOT_VERSION);
+ errs++;
+ }
+
+ if (ddq->d_flags != XFS_DQ_USER &&
+ ddq->d_flags != XFS_DQ_PROJ &&
+ ddq->d_flags != XFS_DQ_GROUP) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+ str, id, ddq->d_flags);
+ errs++;
+ }
+
+ if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : ondisk-dquot 0x%p, ID mismatch: "
+ "0x%x expected, found id 0x%x",
+ str, ddq, id, be32_to_cpu(ddq->d_id));
+ errs++;
+ }
+
+ if (!errs && ddq->d_id) {
+ if (ddq->d_blk_softlimit &&
+ be64_to_cpu(ddq->d_bcount) >=
+ be64_to_cpu(ddq->d_blk_softlimit)) {
+ if (!ddq->d_btimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_ino_softlimit &&
+ be64_to_cpu(ddq->d_icount) >=
+ be64_to_cpu(ddq->d_ino_softlimit)) {
+ if (!ddq->d_itimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_rtb_softlimit &&
+ be64_to_cpu(ddq->d_rtbcount) >=
+ be64_to_cpu(ddq->d_rtb_softlimit)) {
+ if (!ddq->d_rtbtimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ }
+
+ if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+ return errs;
+
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+ /*
+ * Typically, a repair is only requested by quotacheck.
+ */
+ ASSERT(id != -1);
+ ASSERT(flags & XFS_QMOPT_DQREPAIR);
+ memset(d, 0, sizeof(xfs_dqblk_t));
+
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_flags = type;
+ d->dd_diskdq.d_id = cpu_to_be32(id);
+
+ return errs;
+}
+
+/*
+ * Perform a dquot buffer recovery.
+ * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * (ie. USR or GRP), then just toss this buffer away; don't recover it.
+ * Else, treat it as a regular buffer and do recovery.
+ */
+STATIC void
+xlog_recover_do_dquot_buffer(
+ xfs_mount_t *mp,
+ xlog_t *log,
+ xlog_recover_item_t *item,
+ xfs_buf_t *bp,
+ xfs_buf_log_format_t *buf_f)
+{
+ uint type;
+
+ trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (mp->m_qflags == 0) {
+ return;
+ }
+
+ type = 0;
+ if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
+ type |= XFS_DQ_USER;
+ if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
+ type |= XFS_DQ_PROJ;
+ if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
+ type |= XFS_DQ_GROUP;
+ /*
+ * This type of quotas was turned off, so ignore this buffer
+ */
+ if (log->l_quotaoffs_flag & type)
+ return;
+
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+}
+
+/*
+ * This routine replays a modification made to a buffer at runtime.
+ * There are actually two types of buffer, regular and inode, which
+ * are handled differently. Inode buffers are handled differently
+ * in that we only recover a specific set of data from them, namely
+ * the inode di_next_unlinked fields. This is because all other inode
+ * data is actually logged via inode records and any data we replay
+ * here which overlaps that may be stale.
+ *
+ * When meta-data buffers are freed at run time we log a buffer item
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
+ * of the buffer in the log should not be replayed at recovery time.
+ * This is so that if the blocks covered by the buffer are reused for
+ * file data before we crash we don't end up replaying old, freed
+ * meta-data into a user's file.
+ *
+ * To handle the cancellation of buffer log items, we make two passes
+ * over the log during recovery. During the first we build a table of
+ * those buffers which have been cancelled, and during the second we
+ * only replay those buffers which do not have corresponding cancel
+ * records in the table. See xlog_recover_do_buffer_pass[1,2] above
+ * for more details on the implementation of the table of cancel records.
+ */
+STATIC int
+xlog_recover_buffer_pass2(
+ xlog_t *log,
+ xlog_recover_item_t *item)
+{
+ xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
+ xfs_mount_t *mp = log->l_mp;
+ xfs_buf_t *bp;
+ int error;
+ uint buf_flags;
+
+ /*
+ * In this pass we only want to recover all the buffers which have
+ * not been cancelled and are not cancellation buffers themselves.
+ */
+ if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len, buf_f->blf_flags)) {
+ trace_xfs_log_recover_buf_cancel(log, buf_f);
+ return 0;
+ }
+
+ trace_xfs_log_recover_buf_recover(log, buf_f);
+
+ buf_flags = XBF_LOCK;
+ if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
+ buf_flags |= XBF_MAPPED;
+
+ bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+ buf_flags);
+ if (XFS_BUF_ISERROR(bp)) {
+ xfs_ioerror_alert("xlog_recover_do..(read#1)", mp,
+ bp, buf_f->blf_blkno);
+ error = XFS_BUF_GETERROR(bp);
+ xfs_buf_relse(bp);
+ return error;
+ }
+
+ error = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+ } else if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+ } else {
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+ }
+ if (error)
+ return XFS_ERROR(error);
+
+ /*
+ * Perform delayed write on the buffer. Asynchronous writes will be
+ * slower when taking into account all the buffers to be flushed.
+ *
+ * Also make sure that only inode buffers with good sizes stay in
+ * the buffer cache. The kernel moves inodes in buffers of 1 block
+ * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
+ * buffers in the log can be a different size if the log was generated
+ * by an older kernel using unclustered inode buffers or a newer kernel
+ * running with a different inode cluster size. Regardless, if the
+ * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
+ * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+ * the buffer out of the buffer cache so that the buffer won't
+ * overlap with future reads of those inodes.
+ */
+ if (XFS_DINODE_MAGIC ==
+ be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
+ (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
+ (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+ XFS_BUF_STALE(bp);
+ error = xfs_bwrite(mp, bp);
+ } else {
+ ASSERT(bp->b_target->bt_mount == mp);
+ XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+ xfs_bdwrite(mp, bp);
+ }
+
+ return (error);
+}
+
+STATIC int
+xlog_recover_inode_pass2(
+ xlog_t *log,
+ xlog_recover_item_t *item)
+{
+ xfs_inode_log_format_t *in_f;
+ xfs_mount_t *mp = log->l_mp;
+ xfs_buf_t *bp;
+ xfs_dinode_t *dip;
+ int len;
+ xfs_caddr_t src;
+ xfs_caddr_t dest;
+ int error;
+ int attr_index;
+ uint fields;
+ xfs_icdinode_t *dicp;
+ int need_free = 0;
+
+ if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
+ in_f = item->ri_buf[0].i_addr;
+ } else {
+ in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
+ need_free = 1;
+ error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+ if (error)
+ goto error;
+ }
+
+ /*
+ * Inode buffers can be freed, look out for it,
+ * and do not replay the inode.
+ */
+ if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
+ in_f->ilf_len, 0)) {
+ error = 0;
+ trace_xfs_log_recover_inode_cancel(log, in_f);
+ goto error;
+ }
+ trace_xfs_log_recover_inode_recover(log, in_f);
+
+ bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
+ XBF_LOCK);
+ if (XFS_BUF_ISERROR(bp)) {
+ xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
+ bp, in_f->ilf_blkno);
+ error = XFS_BUF_GETERROR(bp);
+ xfs_buf_relse(bp);
+ goto error;
+ }
+ error = 0;
+ ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+
+ /*
+ * Make sure the place we're flushing out to really looks
+ * like an inode!
+ */
+ if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
+ xfs_buf_relse(bp);
+ xfs_alert(mp,
+ "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
+ __func__, dip, bp, in_f->ilf_ino);
+ XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = EFSCORRUPTED;
+ goto error;
+ }
+ dicp = item->ri_buf[1].i_addr;
+ if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
+ xfs_buf_relse(bp);
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
+ __func__, item, in_f->ilf_ino);
+ XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
+ XFS_ERRLEVEL_LOW, mp);
+ error = EFSCORRUPTED;
+ goto error;
+ }
+
+ /* Skip replay when the on disk inode is newer than the log one */
+ if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case, DI_MAX_FLUSH is less
+ * than smaller numbers
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ xfs_buf_relse(bp);
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto error;
+ }
+ }
+ /* Take the opportunity to reset the flush iteration count */
+ dicp->di_flushiter = 0;
+
+ if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
+ if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
+ XFS_ERRLEVEL_LOW, mp, dicp);
+ xfs_buf_relse(bp);
+ xfs_alert(mp,
+ "%s: Bad regular inode log record, rec ptr 0x%p, "
+ "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino);
+ error = EFSCORRUPTED;
+ goto error;
+ }
+ } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
+ if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
+ (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
+ XFS_ERRLEVEL_LOW, mp, dicp);
+ xfs_buf_relse(bp);
+ xfs_alert(mp,
+ "%s: Bad dir inode log record, rec ptr 0x%p, "
+ "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino);
+ error = EFSCORRUPTED;
+ goto error;
+ }
+ }
+ if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
+ XFS_ERRLEVEL_LOW, mp, dicp);
+ xfs_buf_relse(bp);
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+ "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino,
+ dicp->di_nextents + dicp->di_anextents,
+ dicp->di_nblocks);
+ error = EFSCORRUPTED;
+ goto error;
+ }
+ if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
+ XFS_ERRLEVEL_LOW, mp, dicp);
+ xfs_buf_relse(bp);
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+ "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
+ item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
+ error = EFSCORRUPTED;
+ goto error;
+ }
+ if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
+ XFS_ERRLEVEL_LOW, mp, dicp);
+ xfs_buf_relse(bp);
+ xfs_alert(mp,
+ "%s: Bad inode log record length %d, rec ptr 0x%p",
+ __func__, item->ri_buf[1].i_len, item);
+ error = EFSCORRUPTED;
+ goto error;
+ }
+
+ /* The core is in in-core format */
+ xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
+
+ /* the rest is in on-disk format */
+ if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
+ memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
+ item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
+ item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
+ }
+
+ fields = in_f->ilf_fields;
+ switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
+ case XFS_ILOG_DEV:
+ xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
+ break;
+ case XFS_ILOG_UUID:
+ memcpy(XFS_DFORK_DPTR(dip),
+ &in_f->ilf_u.ilfu_uuid,
+ sizeof(uuid_t));
+ break;
+ }
+
+ if (in_f->ilf_size == 2)
+ goto write_inode_buffer;
+ len = item->ri_buf[2].i_len;
+ src = item->ri_buf[2].i_addr;
+ ASSERT(in_f->ilf_size <= 4);
+ ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
+ ASSERT(!(fields & XFS_ILOG_DFORK) ||
+ (len == in_f->ilf_dsize));
+
+ switch (fields & XFS_ILOG_DFORK) {
+ case XFS_ILOG_DDATA:
+ case XFS_ILOG_DEXT:
+ memcpy(XFS_DFORK_DPTR(dip), src, len);
+ break;
+
+ case XFS_ILOG_DBROOT:
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+ (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
+ XFS_DFORK_DSIZE(dip, mp));
+ break;
+
+ default:
+ /*
+ * There are no data fork flags set.
+ */
+ ASSERT((fields & XFS_ILOG_DFORK) == 0);
+ break;
+ }
+
+ /*
+ * If we logged any attribute data, recover it. There may or
+ * may not have been any other non-core data logged in this
+ * transaction.
+ */
+ if (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ if (in_f->ilf_fields & XFS_ILOG_DFORK) {
+ attr_index = 3;
+ } else {
+ attr_index = 2;
+ }
+ len = item->ri_buf[attr_index].i_len;
+ src = item->ri_buf[attr_index].i_addr;
+ ASSERT(len == in_f->ilf_asize);
+
+ switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ case XFS_ILOG_ADATA:
+ case XFS_ILOG_AEXT:
+ dest = XFS_DFORK_APTR(dip);
+ ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
+ memcpy(dest, src, len);
+ break;
+
+ case XFS_ILOG_ABROOT:
+ dest = XFS_DFORK_APTR(dip);
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+ len, (xfs_bmdr_block_t*)dest,
+ XFS_DFORK_ASIZE(dip, mp));
+ break;
+
+ default:
+ xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
+ ASSERT(0);
+ xfs_buf_relse(bp);
+ error = EIO;
+ goto error;
+ }
+ }
+
+write_inode_buffer:
+ ASSERT(bp->b_target->bt_mount == mp);
+ XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+ xfs_bdwrite(mp, bp);
+error:
+ if (need_free)
+ kmem_free(in_f);
+ return XFS_ERROR(error);
+}
+
+/*
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
+ * structure, so that we know not to do any dquot item or dquot buffer recovery,
+ * of that type.
+ */
+STATIC int
+xlog_recover_quotaoff_pass1(
+ xlog_t *log,
+ xlog_recover_item_t *item)
+{
+ xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
+ ASSERT(qoff_f);
+
+ /*
+ * The logitem format's flag tells us if this was user quotaoff,
+ * group/project quotaoff or both.
+ */
+ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQ_USER;
+ if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQ_PROJ;
+ if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQ_GROUP;
+
+ return (0);
+}
+
+/*
+ * Recover a dquot record
+ */
+STATIC int
+xlog_recover_dquot_pass2(
+ xlog_t *log,
+ xlog_recover_item_t *item)
+{
+ xfs_mount_t *mp = log->l_mp;
+ xfs_buf_t *bp;
+ struct xfs_disk_dquot *ddq, *recddq;
+ int error;
+ xfs_dq_logformat_t *dq_f;
+ uint type;
+
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (mp->m_qflags == 0)
+ return (0);
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL) {
+ xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
+ return XFS_ERROR(EIO);
+ }
+ if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
+ xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
+ item->ri_buf[1].i_len, __func__);
+ return XFS_ERROR(EIO);
+ }
+
+ /*
+ * This type of quotas was turned off, so ignore this record.
+ */
+ type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return (0);
+
+ /*
+ * At this point we know that quota was _not_ turned off.
+ * Since the mount flags are not indicating to us otherwise, this
+ * must mean that quota is on, and the dquot needs to be replayed.
+ * Remember that we may not have fully recovered the superblock yet,
+ * so we can't do the usual trick of looking at the SB quota bits.
+ *
+ * The other possibility, of course, is that the quota subsystem was
+ * removed since the last mount - ENOSYS.
+ */
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ "xlog_recover_dquot_pass2 (log copy)");
+ if (error)
+ return XFS_ERROR(EIO);
+ ASSERT(dq_f->qlf_len == 1);
+
+ error = xfs_read_buf(mp, mp->m_ddev_targp,
+ dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len),
+ 0, &bp);
+ if (error) {
+ xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
+ bp, dq_f->qlf_blkno);
+ return error;
+ }
+ ASSERT(bp);
+ ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+
+ /*
+ * At least the magic num portion should be on disk because this
+ * was among a chunk of dquots created earlier, and we did some
+ * minimal initialization then.
+ */
+ error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ "xlog_recover_dquot_pass2");
+ if (error) {
+ xfs_buf_relse(bp);
+ return XFS_ERROR(EIO);
+ }
+
+ memcpy(ddq, recddq, item->ri_buf[1].i_len);
+
+ ASSERT(dq_f->qlf_size == 2);
+ ASSERT(bp->b_target->bt_mount == mp);
+ XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+ xfs_bdwrite(mp, bp);
+
+ return (0);
+}
+
+/*
+ * This routine is called to create an in-core extent free intent
+ * item from the efi format structure which was logged on disk.
+ * It allocates an in-core efi, copies the extents from the format
+ * structure into it, and adds the efi to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_efi_pass2(
+ xlog_t *log,
+ xlog_recover_item_t *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ xfs_mount_t *mp = log->l_mp;
+ xfs_efi_log_item_t *efip;
+ xfs_efi_log_format_t *efi_formatp;
+
+ efi_formatp = item->ri_buf[0].i_addr;
+
+ efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+ if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
+ &(efip->efi_format)))) {
+ xfs_efi_item_free(efip);
+ return error;
+ }
+ atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
+
+ spin_lock(&log->l_ailp->xa_lock);
+ /*
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
+ return 0;
+}
+
+
+/*
+ * This routine is called when an efd format structure is found in
+ * a committed transaction in the log. It's purpose is to cancel
+ * the corresponding efi if it was still in the log. To do this
+ * it searches the AIL for the efi with an id equal to that in the
+ * efd format structure. If we find it, we remove the efi from the
+ * AIL and free it.
+ */
+STATIC int
+xlog_recover_efd_pass2(
+ xlog_t *log,
+ xlog_recover_item_t *item)
+{
+ xfs_efd_log_format_t *efd_formatp;
+ xfs_efi_log_item_t *efip = NULL;
+ xfs_log_item_t *lip;
+ __uint64_t efi_id;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ efd_formatp = item->ri_buf[0].i_addr;
+ ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
+ ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
+ (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
+ ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
+ efi_id = efd_formatp->efd_efi_id;
+
+ /*
+ * Search for the efi with the id in the efd format structure
+ * in the AIL.
+ */
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (lip->li_type == XFS_LI_EFI) {
+ efip = (xfs_efi_log_item_t *)lip;
+ if (efip->efi_format.efi_id == efi_id) {
+ /*
+ * xfs_trans_ail_delete() drops the
+ * AIL lock.
+ */
+ xfs_trans_ail_delete(ailp, lip);
+ xfs_efi_item_free(efip);
+ spin_lock(&ailp->xa_lock);
+ break;
+ }
+ }
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+ xfs_trans_ail_cursor_done(ailp, &cur);
+ spin_unlock(&ailp->xa_lock);
+
+ return 0;
+}
+
+/*
+ * Free up any resources allocated by the transaction
+ *
+ * Remember that EFIs, EFDs, and IUNLINKs are handled later.
+ */
+STATIC void
+xlog_recover_free_trans(
+ struct xlog_recover *trans)
+{
+ xlog_recover_item_t *item, *n;
+ int i;
+
+ list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+ /* Free the regions in the item. */
+ list_del(&item->ri_list);
+ for (i = 0; i < item->ri_cnt; i++)
+ kmem_free(item->ri_buf[i].i_addr);
+ /* Free the item itself */
+ kmem_free(item->ri_buf);
+ kmem_free(item);
+ }
+ /* Free the transaction recover structure */
+ kmem_free(trans);
+}
+
+STATIC int
+xlog_recover_commit_pass1(
+ struct log *log,
+ struct xlog_recover *trans,
+ xlog_recover_item_t *item)
+{
+ trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
+
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ return xlog_recover_buffer_pass1(log, item);
+ case XFS_LI_QUOTAOFF:
+ return xlog_recover_quotaoff_pass1(log, item);
+ case XFS_LI_INODE:
+ case XFS_LI_EFI:
+ case XFS_LI_EFD:
+ case XFS_LI_DQUOT:
+ /* nothing to do in pass 1 */
+ return 0;
+ default:
+ xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+ __func__, ITEM_TYPE(item));
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+}
+
+STATIC int
+xlog_recover_commit_pass2(
+ struct log *log,
+ struct xlog_recover *trans,
+ xlog_recover_item_t *item)
+{
+ trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
+
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ return xlog_recover_buffer_pass2(log, item);
+ case XFS_LI_INODE:
+ return xlog_recover_inode_pass2(log, item);
+ case XFS_LI_EFI:
+ return xlog_recover_efi_pass2(log, item, trans->r_lsn);
+ case XFS_LI_EFD:
+ return xlog_recover_efd_pass2(log, item);
+ case XFS_LI_DQUOT:
+ return xlog_recover_dquot_pass2(log, item);
+ case XFS_LI_QUOTAOFF:
+ /* nothing to do in pass2 */
+ return 0;
+ default:
+ xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+ __func__, ITEM_TYPE(item));
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+}
+
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now. Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
+xlog_recover_commit_trans(
+ struct log *log,
+ struct xlog_recover *trans,
+ int pass)
+{
+ int error = 0;
+ xlog_recover_item_t *item;
+
+ hlist_del(&trans->r_list);
+
+ error = xlog_recover_reorder_trans(log, trans, pass);
+ if (error)
+ return error;
+
+ list_for_each_entry(item, &trans->r_itemq, ri_list) {
+ if (pass == XLOG_RECOVER_PASS1)
+ error = xlog_recover_commit_pass1(log, trans, item);
+ else
+ error = xlog_recover_commit_pass2(log, trans, item);
+ if (error)
+ return error;
+ }
+
+ xlog_recover_free_trans(trans);
+ return 0;
+}
+
+STATIC int
+xlog_recover_unmount_trans(
+ struct log *log,
+ xlog_recover_t *trans)
+{
+ /* Do nothing now */
+ xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
+ return 0;
+}
+
+/*
+ * There are two valid states of the r_state field. 0 indicates that the
+ * transaction structure is in a normal state. We have either seen the
+ * start of the transaction or the last operation we added was not a partial
+ * operation. If the last operation we added to the transaction was a
+ * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
+ *
+ * NOTE: skip LRs with 0 data length.
+ */
+STATIC int
+xlog_recover_process_data(
+ xlog_t *log,
+ struct hlist_head rhash[],
+ xlog_rec_header_t *rhead,
+ xfs_caddr_t dp,
+ int pass)
+{
+ xfs_caddr_t lp;
+ int num_logops;
+ xlog_op_header_t *ohead;
+ xlog_recover_t *trans;
+ xlog_tid_t tid;
+ int error;
+ unsigned long hash;
+ uint flags;
+
+ lp = dp + be32_to_cpu(rhead->h_len);
+ num_logops = be32_to_cpu(rhead->h_num_logops);
+
+ /* check the log format matches our own - else we can't recover */
+ if (xlog_header_check_recover(log->l_mp, rhead))
+ return (XFS_ERROR(EIO));
+
+ while ((dp < lp) && num_logops) {
+ ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
+ ohead = (xlog_op_header_t *)dp;
+ dp += sizeof(xlog_op_header_t);
+ if (ohead->oh_clientid != XFS_TRANSACTION &&
+ ohead->oh_clientid != XFS_LOG) {
+ xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
+ __func__, ohead->oh_clientid);
+ ASSERT(0);
+ return (XFS_ERROR(EIO));
+ }
+ tid = be32_to_cpu(ohead->oh_tid);
+ hash = XLOG_RHASH(tid);
+ trans = xlog_recover_find_tid(&rhash[hash], tid);
+ if (trans == NULL) { /* not found; add new tid */
+ if (ohead->oh_flags & XLOG_START_TRANS)
+ xlog_recover_new_tid(&rhash[hash], tid,
+ be64_to_cpu(rhead->h_lsn));
+ } else {
+ if (dp + be32_to_cpu(ohead->oh_len) > lp) {
+ xfs_warn(log->l_mp, "%s: bad length 0x%x",
+ __func__, be32_to_cpu(ohead->oh_len));
+ WARN_ON(1);
+ return (XFS_ERROR(EIO));
+ }
+ flags = ohead->oh_flags & ~XLOG_END_TRANS;
+ if (flags & XLOG_WAS_CONT_TRANS)
+ flags &= ~XLOG_CONTINUE_TRANS;
+ switch (flags) {
+ case XLOG_COMMIT_TRANS:
+ error = xlog_recover_commit_trans(log,
+ trans, pass);
+ break;
+ case XLOG_UNMOUNT_TRANS:
+ error = xlog_recover_unmount_trans(log, trans);
+ break;
+ case XLOG_WAS_CONT_TRANS:
+ error = xlog_recover_add_to_cont_trans(log,
+ trans, dp,
+ be32_to_cpu(ohead->oh_len));
+ break;
+ case XLOG_START_TRANS:
+ xfs_warn(log->l_mp, "%s: bad transaction",
+ __func__);
+ ASSERT(0);
+ error = XFS_ERROR(EIO);
+ break;
+ case 0:
+ case XLOG_CONTINUE_TRANS:
+ error = xlog_recover_add_to_trans(log, trans,
+ dp, be32_to_cpu(ohead->oh_len));
+ break;
+ default:
+ xfs_warn(log->l_mp, "%s: bad flag 0x%x",
+ __func__, flags);
+ ASSERT(0);
+ error = XFS_ERROR(EIO);
+ break;
+ }
+ if (error)
+ return error;
+ }
+ dp += be32_to_cpu(ohead->oh_len);
+ num_logops--;
+ }
+ return 0;
+}
+
+/*
+ * Process an extent free intent item that was recovered from
+ * the log. We need to free the extents that it describes.
+ */
+STATIC int
+xlog_recover_process_efi(
+ xfs_mount_t *mp,
+ xfs_efi_log_item_t *efip)
+{
+ xfs_efd_log_item_t *efdp;
+ xfs_trans_t *tp;
+ int i;
+ int error = 0;
+ xfs_extent_t *extp;
+ xfs_fsblock_t startblock_fsb;
+
+ ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+
+ /*
+ * First check the validity of the extents described by the
+ * EFI. If any are bad, then assume that all are bad and
+ * just toss the EFI.
+ */
+ for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ extp = &(efip->efi_format.efi_extents[i]);
+ startblock_fsb = XFS_BB_TO_FSB(mp,
+ XFS_FSB_TO_DADDR(mp, extp->ext_start));
+ if ((startblock_fsb == 0) ||
+ (extp->ext_len == 0) ||
+ (startblock_fsb >= mp->m_sb.sb_dblocks) ||
+ (extp->ext_len >= mp->m_sb.sb_agblocks)) {
+ /*
+ * This will pull the EFI from the AIL and
+ * free the memory associated with it.
+ */
+ xfs_efi_release(efip, efip->efi_format.efi_nextents);
+ return XFS_ERROR(EIO);
+ }
+ }
+
+ tp = xfs_trans_alloc(mp, 0);
+ error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+ if (error)
+ goto abort_error;
+ efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+
+ for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ extp = &(efip->efi_format.efi_extents[i]);
+ error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+ if (error)
+ goto abort_error;
+ xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
+ extp->ext_len);
+ }
+
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+ error = xfs_trans_commit(tp, 0);
+ return error;
+
+abort_error:
+ xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ return error;
+}
+
+/*
+ * When this is called, all of the EFIs which did not have
+ * corresponding EFDs should be in the AIL. What we do now
+ * is free the extents associated with each one.
+ *
+ * Since we process the EFIs in normal transactions, they
+ * will be removed at some point after the commit. This prevents
+ * us from just walking down the list processing each one.
+ * We'll use a flag in the EFI to skip those that we've already
+ * processed and use the AIL iteration mechanism's generation
+ * count to try to speed this up at least a bit.
+ *
+ * When we start, we know that the EFIs are the only things in
+ * the AIL. As we process them, however, other items are added
+ * to the AIL. Since everything added to the AIL must come after
+ * everything already in the AIL, we stop processing as soon as
+ * we see something other than an EFI in the AIL.
+ */
+STATIC int
+xlog_recover_process_efis(
+ xlog_t *log)
+{
+ xfs_log_item_t *lip;
+ xfs_efi_log_item_t *efip;
+ int error = 0;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp;
+
+ ailp = log->l_ailp;
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ /*
+ * We're done when we see something other than an EFI.
+ * There should be no EFIs left in the AIL now.
+ */
+ if (lip->li_type != XFS_LI_EFI) {
+#ifdef DEBUG
+ for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+ ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
+ break;
+ }
+
+ /*
+ * Skip EFIs that we've already processed.
+ */
+ efip = (xfs_efi_log_item_t *)lip;
+ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ continue;
+ }
+
+ spin_unlock(&ailp->xa_lock);
+ error = xlog_recover_process_efi(log->l_mp, efip);
+ spin_lock(&ailp->xa_lock);
+ if (error)
+ goto out;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+out:
+ xfs_trans_ail_cursor_done(ailp, &cur);
+ spin_unlock(&ailp->xa_lock);
+ return error;
+}
+
+/*
+ * This routine performs a transaction to null out a bad inode pointer
+ * in an agi unlinked inode hash bucket.
+ */
+STATIC void
+xlog_recover_clear_agi_bucket(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno,
+ int bucket)
+{
+ xfs_trans_t *tp;
+ xfs_agi_t *agi;
+ xfs_buf_t *agibp;
+ int offset;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
+ error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
+ 0, 0, 0);
+ if (error)
+ goto out_abort;
+
+ error = xfs_read_agi(mp, tp, agno, &agibp);
+ if (error)
+ goto out_abort;
+
+ agi = XFS_BUF_TO_AGI(agibp);
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ offset = offsetof(xfs_agi_t, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket);
+ xfs_trans_log_buf(tp, agibp, offset,
+ (offset + sizeof(xfs_agino_t) - 1));
+
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto out_error;
+ return;
+
+out_abort:
+ xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+out_error:
+ xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
+ return;
+}
+
+STATIC xfs_agino_t
+xlog_recover_process_one_iunlink(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ int bucket)
+{
+ struct xfs_buf *ibp;
+ struct xfs_dinode *dip;
+ struct xfs_inode *ip;
+ xfs_ino_t ino;
+ int error;
+
+ ino = XFS_AGINO_TO_INO(mp, agno, agino);
+ error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+ if (error)
+ goto fail;
+
+ /*
+ * Get the on disk inode to find the next inode in the bucket.
+ */
+ error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
+ if (error)
+ goto fail_iput;
+
+ ASSERT(ip->i_d.di_nlink == 0);
+ ASSERT(ip->i_d.di_mode != 0);
+
+ /* setup for the next pass */
+ agino = be32_to_cpu(dip->di_next_unlinked);
+ xfs_buf_relse(ibp);
+
+ /*
+ * Prevent any DMAPI event from being sent when the reference on
+ * the inode is dropped.
+ */
+ ip->i_d.di_dmevmask = 0;
+
+ IRELE(ip);
+ return agino;
+
+ fail_iput:
+ IRELE(ip);
+ fail:
+ /*
+ * We can't read in the inode this bucket points to, or this inode
+ * is messed up. Just ditch this bucket of inodes. We will lose
+ * some inodes and space, but at least we won't hang.
+ *
+ * Call xlog_recover_clear_agi_bucket() to perform a transaction to
+ * clear the inode pointer in the bucket.
+ */
+ xlog_recover_clear_agi_bucket(mp, agno, bucket);
+ return NULLAGINO;
+}
+
+/*
+ * xlog_iunlink_recover
+ *
+ * This is called during recovery to process any inodes which
+ * we unlinked but not freed when the system crashed. These
+ * inodes will be on the lists in the AGI blocks. What we do
+ * here is scan all the AGIs and fully truncate and free any
+ * inodes found on the lists. Each inode is removed from the
+ * lists when it has been fully truncated and is freed. The
+ * freeing of the inode and its removal from the list must be
+ * atomic.
+ */
+STATIC void
+xlog_recover_process_iunlinks(
+ xlog_t *log)
+{
+ xfs_mount_t *mp;
+ xfs_agnumber_t agno;
+ xfs_agi_t *agi;
+ xfs_buf_t *agibp;
+ xfs_agino_t agino;
+ int bucket;
+ int error;
+ uint mp_dmevmask;
+
+ mp = log->l_mp;
+
+ /*
+ * Prevent any DMAPI event from being sent while in this function.
+ */
+ mp_dmevmask = mp->m_dmevmask;
+ mp->m_dmevmask = 0;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ /*
+ * Find the agi for this ag.
+ */
+ error = xfs_read_agi(mp, NULL, agno, &agibp);
+ if (error) {
+ /*
+ * AGI is b0rked. Don't process it.
+ *
+ * We should probably mark the filesystem as corrupt
+ * after we've recovered all the ag's we can....
+ */
+ continue;
+ }
+ /*
+ * Unlock the buffer so that it can be acquired in the normal
+ * course of the transaction to truncate and free each inode.
+ * Because we are not racing with anyone else here for the AGI
+ * buffer, we don't even need to hold it locked to read the
+ * initial unlinked bucket entries out of the buffer. We keep
+ * buffer reference though, so that it stays pinned in memory
+ * while we need the buffer.
+ */
+ agi = XFS_BUF_TO_AGI(agibp);
+ xfs_buf_unlock(agibp);
+
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+ agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (agino != NULLAGINO) {
+ agino = xlog_recover_process_one_iunlink(mp,
+ agno, agino, bucket);
+ }
+ }
+ xfs_buf_rele(agibp);
+ }
+
+ mp->m_dmevmask = mp_dmevmask;
+}
+
+
+#ifdef DEBUG
+STATIC void
+xlog_pack_data_checksum(
+ xlog_t *log,
+ xlog_in_core_t *iclog,
+ int size)
+{
+ int i;
+ __be32 *up;
+ uint chksum = 0;
+
+ up = (__be32 *)iclog->ic_datap;
+ /* divide length by 4 to get # words */
+ for (i = 0; i < (size >> 2); i++) {
+ chksum ^= be32_to_cpu(*up);
+ up++;
+ }
+ iclog->ic_header.h_chksum = cpu_to_be32(chksum);
+}
+#else
+#define xlog_pack_data_checksum(log, iclog, size)
+#endif
+
+/*
+ * Stamp cycle number in every block
+ */
+void
+xlog_pack_data(
+ xlog_t *log,
+ xlog_in_core_t *iclog,
+ int roundoff)
+{
+ int i, j, k;
+ int size = iclog->ic_offset + roundoff;
+ __be32 cycle_lsn;
+ xfs_caddr_t dp;
+
+ xlog_pack_data_checksum(log, iclog, size);
+
+ cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+ dp = iclog->ic_datap;
+ for (i = 0; i < BTOBB(size) &&
+ i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+ iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+ for ( ; i < BTOBB(size); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ for (i = 1; i < log->l_iclog_heads; i++) {
+ xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+ }
+ }
+}
+
+STATIC void
+xlog_unpack_data(
+ xlog_rec_header_t *rhead,
+ xfs_caddr_t dp,
+ xlog_t *log)
+{
+ int i, j, k;
+
+ for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
+ i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+ *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
+ for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+ dp += BBSIZE;
+ }
+ }
+}
+
+STATIC int
+xlog_valid_rec_header(
+ xlog_t *log,
+ xlog_rec_header_t *rhead,
+ xfs_daddr_t blkno)
+{
+ int hlen;
+
+ if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
+ XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
+ XFS_ERRLEVEL_LOW, log->l_mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ if (unlikely(
+ (!rhead->h_version ||
+ (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
+ xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
+ __func__, be32_to_cpu(rhead->h_version));
+ return XFS_ERROR(EIO);
+ }
+
+ /* LR body must have data or it wouldn't have been written */
+ hlen = be32_to_cpu(rhead->h_len);
+ if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
+ XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
+ XFS_ERRLEVEL_LOW, log->l_mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
+ XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
+ XFS_ERRLEVEL_LOW, log->l_mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+ return 0;
+}
+
+/*
+ * Read the log from tail to head and process the log records found.
+ * Handle the two cases where the tail and head are in the same cycle
+ * and where the active portion of the log wraps around the end of
+ * the physical log separately. The pass parameter is passed through
+ * to the routines called to process the data and is not looked at
+ * here.
+ */
+STATIC int
+xlog_do_recovery_pass(
+ xlog_t *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int pass)
+{
+ xlog_rec_header_t *rhead;
+ xfs_daddr_t blk_no;
+ xfs_caddr_t offset;
+ xfs_buf_t *hbp, *dbp;
+ int error = 0, h_size;
+ int bblks, split_bblks;
+ int hblks, split_hblks, wrapped_hblks;
+ struct hlist_head rhash[XLOG_RHASH_SIZE];
+
+ ASSERT(head_blk != tail_blk);
+
+ /*
+ * Read the header of the tail block and get the iclog buffer size from
+ * h_size. Use this to tell how many sectors make up the log header.
+ */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ /*
+ * When using variable length iclogs, read first sector of
+ * iclog header and extract the header size from it. Get a
+ * new hbp that is the correct size.
+ */
+ hbp = xlog_get_bp(log, 1);
+ if (!hbp)
+ return ENOMEM;
+
+ error = xlog_bread(log, tail_blk, 1, hbp, &offset);
+ if (error)
+ goto bread_err1;
+
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead, tail_blk);
+ if (error)
+ goto bread_err1;
+ h_size = be32_to_cpu(rhead->h_size);
+ if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
+ (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+ hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+ if (h_size % XLOG_HEADER_CYCLE_SIZE)
+ hblks++;
+ xlog_put_bp(hbp);
+ hbp = xlog_get_bp(log, hblks);
+ } else {
+ hblks = 1;
+ }
+ } else {
+ ASSERT(log->l_sectBBsize == 1);
+ hblks = 1;
+ hbp = xlog_get_bp(log, 1);
+ h_size = XLOG_BIG_RECORD_BSIZE;
+ }
+
+ if (!hbp)
+ return ENOMEM;
+ dbp = xlog_get_bp(log, BTOBB(h_size));
+ if (!dbp) {
+ xlog_put_bp(hbp);
+ return ENOMEM;
+ }
+
+ memset(rhash, 0, sizeof(rhash));
+ if (tail_blk <= head_blk) {
+ for (blk_no = tail_blk; blk_no < head_blk; ) {
+ error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+ if (error)
+ goto bread_err2;
+
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead, blk_no);
+ if (error)
+ goto bread_err2;
+
+ /* blocks in data section */
+ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+ error = xlog_bread(log, blk_no + hblks, bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+
+ xlog_unpack_data(rhead, offset, log);
+ if ((error = xlog_recover_process_data(log,
+ rhash, rhead, offset, pass)))
+ goto bread_err2;
+ blk_no += bblks + hblks;
+ }
+ } else {
+ /*
+ * Perform recovery around the end of the physical log.
+ * When the head is not on the same cycle number as the tail,
+ * we can't do a sequential recovery as above.
+ */
+ blk_no = tail_blk;
+ while (blk_no < log->l_logBBsize) {
+ /*
+ * Check for header wrapping around physical end-of-log
+ */
+ offset = XFS_BUF_PTR(hbp);
+ split_hblks = 0;
+ wrapped_hblks = 0;
+ if (blk_no + hblks <= log->l_logBBsize) {
+ /* Read header in one read */
+ error = xlog_bread(log, blk_no, hblks, hbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ } else {
+ /* This LR is split across physical log end */
+ if (blk_no != log->l_logBBsize) {
+ /* some data before physical log end */
+ ASSERT(blk_no <= INT_MAX);
+ split_hblks = log->l_logBBsize - (int)blk_no;
+ ASSERT(split_hblks > 0);
+ error = xlog_bread(log, blk_no,
+ split_hblks, hbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ }
+
+ /*
+ * Note: this black magic still works with
+ * large sector sizes (non-512) only because:
+ * - we increased the buffer size originally
+ * by 1 sector giving us enough extra space
+ * for the second read;
+ * - the log start is guaranteed to be sector
+ * aligned;
+ * - we read the log end (LR header start)
+ * _first_, then the log start (LR header end)
+ * - order is important.
+ */
+ wrapped_hblks = hblks - split_hblks;
+ error = xlog_bread_offset(log, 0,
+ wrapped_hblks, hbp,
+ offset + BBTOB(split_hblks));
+ if (error)
+ goto bread_err2;
+ }
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead,
+ split_hblks ? blk_no : 0);
+ if (error)
+ goto bread_err2;
+
+ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+ blk_no += hblks;
+
+ /* Read in data for log record */
+ if (blk_no + bblks <= log->l_logBBsize) {
+ error = xlog_bread(log, blk_no, bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ } else {
+ /* This log record is split across the
+ * physical end of log */
+ offset = XFS_BUF_PTR(dbp);
+ split_bblks = 0;
+ if (blk_no != log->l_logBBsize) {
+ /* some data is before the physical
+ * end of log */
+ ASSERT(!wrapped_hblks);
+ ASSERT(blk_no <= INT_MAX);
+ split_bblks =
+ log->l_logBBsize - (int)blk_no;
+ ASSERT(split_bblks > 0);
+ error = xlog_bread(log, blk_no,
+ split_bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+ }
+
+ /*
+ * Note: this black magic still works with
+ * large sector sizes (non-512) only because:
+ * - we increased the buffer size originally
+ * by 1 sector giving us enough extra space
+ * for the second read;
+ * - the log start is guaranteed to be sector
+ * aligned;
+ * - we read the log end (LR header start)
+ * _first_, then the log start (LR header end)
+ * - order is important.
+ */
+ error = xlog_bread_offset(log, 0,
+ bblks - split_bblks, hbp,
+ offset + BBTOB(split_bblks));
+ if (error)
+ goto bread_err2;
+ }
+ xlog_unpack_data(rhead, offset, log);
+ if ((error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass)))
+ goto bread_err2;
+ blk_no += bblks;
+ }
+
+ ASSERT(blk_no >= log->l_logBBsize);
+ blk_no -= log->l_logBBsize;
+
+ /* read first part of physical log */
+ while (blk_no < head_blk) {
+ error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+ if (error)
+ goto bread_err2;
+
+ rhead = (xlog_rec_header_t *)offset;
+ error = xlog_valid_rec_header(log, rhead, blk_no);
+ if (error)
+ goto bread_err2;
+
+ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+ error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+ &offset);
+ if (error)
+ goto bread_err2;
+
+ xlog_unpack_data(rhead, offset, log);
+ if ((error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass)))
+ goto bread_err2;
+ blk_no += bblks + hblks;
+ }
+ }
+
+ bread_err2:
+ xlog_put_bp(dbp);
+ bread_err1:
+ xlog_put_bp(hbp);
+ return error;
+}
+
+/*
+ * Do the recovery of the log. We actually do this in two phases.
+ * The two passes are necessary in order to implement the function
+ * of cancelling a record written into the log. The first pass
+ * determines those things which have been cancelled, and the
+ * second pass replays log items normally except for those which
+ * have been cancelled. The handling of the replay and cancellations
+ * takes place in the log item type specific routines.
+ *
+ * The table of items which have cancel records in the log is allocated
+ * and freed at this level, since only here do we know when all of
+ * the log recovery has been completed.
+ */
+STATIC int
+xlog_do_log_recovery(
+ xlog_t *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ int error, i;
+
+ ASSERT(head_blk != tail_blk);
+
+ /*
+ * First do a pass to find all of the cancelled buf log items.
+ * Store them in the buf_cancel_table for use in the second pass.
+ */
+ log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
+ sizeof(struct list_head),
+ KM_SLEEP);
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
+ error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+ XLOG_RECOVER_PASS1);
+ if (error != 0) {
+ kmem_free(log->l_buf_cancel_table);
+ log->l_buf_cancel_table = NULL;
+ return error;
+ }
+ /*
+ * Then do a second pass to actually recover the items in the log.
+ * When it is complete free the table of buf cancel items.
+ */
+ error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+ XLOG_RECOVER_PASS2);
+#ifdef DEBUG
+ if (!error) {
+ int i;
+
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ ASSERT(list_empty(&log->l_buf_cancel_table[i]));
+ }
+#endif /* DEBUG */
+
+ kmem_free(log->l_buf_cancel_table);
+ log->l_buf_cancel_table = NULL;
+
+ return error;
+}
+
+/*
+ * Do the actual recovery
+ */
+STATIC int
+xlog_do_recover(
+ xlog_t *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ int error;
+ xfs_buf_t *bp;
+ xfs_sb_t *sbp;
+
+ /*
+ * First replay the images in the log.
+ */
+ error = xlog_do_log_recovery(log, head_blk, tail_blk);
+ if (error) {
+ return error;
+ }
+
+ XFS_bflush(log->l_mp->m_ddev_targp);
+
+ /*
+ * If IO errors happened during recovery, bail out.
+ */
+ if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ return (EIO);
+ }
+
+ /*
+ * We now update the tail_lsn since much of the recovery has completed
+ * and there may be space available to use. If there were no extent
+ * or iunlinks, we can free up the entire log and set the tail_lsn to
+ * be the last_sync_lsn. This was set in xlog_find_tail to be the
+ * lsn of the last known good LR on disk. If there are extent frees
+ * or iunlinks they will have some entries in the AIL; so we look at
+ * the AIL to determine how to set the tail_lsn.
+ */
+ xlog_assign_tail_lsn(log->l_mp);
+
+ /*
+ * Now that we've finished replaying all buffer and inode
+ * updates, re-read in the superblock.
+ */
+ bp = xfs_getsb(log->l_mp, 0);
+ XFS_BUF_UNDONE(bp);
+ ASSERT(!(XFS_BUF_ISWRITE(bp)));
+ ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+ XFS_BUF_READ(bp);
+ XFS_BUF_UNASYNC(bp);
+ xfsbdstrat(log->l_mp, bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_ioerror_alert("xlog_do_recover",
+ log->l_mp, bp, XFS_BUF_ADDR(bp));
+ ASSERT(0);
+ xfs_buf_relse(bp);
+ return error;
+ }
+
+ /* Convert superblock from on-disk format */
+ sbp = &log->l_mp->m_sb;
+ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
+ ASSERT(xfs_sb_good_version(sbp));
+ xfs_buf_relse(bp);
+
+ /* We've re-read the superblock so re-initialize per-cpu counters */
+ xfs_icsb_reinit_counters(log->l_mp);
+
+ xlog_recover_check_summary(log);
+
+ /* Normal transactions can now occur */
+ log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+ return 0;
+}
+
+/*
+ * Perform recovery and re-initialize some log variables in xlog_find_tail.
+ *
+ * Return error or zero.
+ */
+int
+xlog_recover(
+ xlog_t *log)
+{
+ xfs_daddr_t head_blk, tail_blk;
+ int error;
+
+ /* find the tail of the log */
+ if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+ return error;
+
+ if (tail_blk != head_blk) {
+ /* There used to be a comment here:
+ *
+ * disallow recovery on read-only mounts. note -- mount
+ * checks for ENOSPC and turns it into an intelligent
+ * error message.
+ * ...but this is no longer true. Now, unless you specify
+ * NORECOVERY (in which case this function would never be
+ * called), we just go ahead and recover. We do this all
+ * under the vfs layer, so we can get away with it unless
+ * the device itself is read-only, in which case we fail.
+ */
+ if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
+ return error;
+ }
+
+ xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
+ log->l_mp->m_logname ? log->l_mp->m_logname
+ : "internal");
+
+ error = xlog_do_recover(log, head_blk, tail_blk);
+ log->l_flags |= XLOG_RECOVERY_NEEDED;
+ }
+ return error;
+}
+
+/*
+ * In the first part of recovery we replay inodes and buffers and build
+ * up the list of extent free items which need to be processed. Here
+ * we process the extent free items and clean up the on disk unlinked
+ * inode lists. This is separated from the first part of recovery so
+ * that the root and real-time bitmap inodes can be read in from disk in
+ * between the two stages. This is necessary so that we can free space
+ * in the real-time portion of the file system.
+ */
+int
+xlog_recover_finish(
+ xlog_t *log)
+{
+ /*
+ * Now we're ready to do the transactions needed for the
+ * rest of recovery. Start with completing all the extent
+ * free intent records and then process the unlinked inode
+ * lists. At this point, we essentially run in normal mode
+ * except that we're still performing recovery actions
+ * rather than accepting new requests.
+ */
+ if (log->l_flags & XLOG_RECOVERY_NEEDED) {
+ int error;
+ error = xlog_recover_process_efis(log);
+ if (error) {
+ xfs_alert(log->l_mp, "Failed to recover EFIs");
+ return error;
+ }
+ /*
+ * Sync the log to get all the EFIs out of the AIL.
+ * This isn't absolutely necessary, but it helps in
+ * case the unlink transactions would have problems
+ * pushing the EFIs out of the way.
+ */
+ xfs_log_force(log->l_mp, XFS_LOG_SYNC);
+
+ xlog_recover_process_iunlinks(log);
+
+ xlog_recover_check_summary(log);
+
+ xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
+ log->l_mp->m_logname ? log->l_mp->m_logname
+ : "internal");
+ log->l_flags &= ~XLOG_RECOVERY_NEEDED;
+ } else {
+ xfs_info(log->l_mp, "Ending clean mount");
+ }
+ return 0;
+}
+
+
+#if defined(DEBUG)
+/*
+ * Read all of the agf and agi counters and check that they
+ * are consistent with the superblock counters.
+ */
+void
+xlog_recover_check_summary(
+ xlog_t *log)
+{
+ xfs_mount_t *mp;
+ xfs_agf_t *agfp;
+ xfs_buf_t *agfbp;
+ xfs_buf_t *agibp;
+ xfs_agnumber_t agno;
+ __uint64_t freeblks;
+ __uint64_t itotal;
+ __uint64_t ifree;
+ int error;
+
+ mp = log->l_mp;
+
+ freeblks = 0LL;
+ itotal = 0LL;
+ ifree = 0LL;
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
+ if (error) {
+ xfs_alert(mp, "%s agf read failed agno %d error %d",
+ __func__, agno, error);
+ } else {
+ agfp = XFS_BUF_TO_AGF(agfbp);
+ freeblks += be32_to_cpu(agfp->agf_freeblks) +
+ be32_to_cpu(agfp->agf_flcount);
+ xfs_buf_relse(agfbp);
+ }
+
+ error = xfs_read_agi(mp, NULL, agno, &agibp);
+ if (error) {
+ xfs_alert(mp, "%s agi read failed agno %d error %d",
+ __func__, agno, error);
+ } else {
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+
+ itotal += be32_to_cpu(agi->agi_count);
+ ifree += be32_to_cpu(agi->agi_freecount);
+ xfs_buf_relse(agibp);
+ }
+ }
+}
+#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
new file mode 100644
index 00000000..1c55ccbb
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_RHASH_BITS 4
+#define XLOG_RHASH_SIZE 16
+#define XLOG_RHASH_SHIFT 2
+#define XLOG_RHASH(tid) \
+ ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
+
+#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
+
+
+/*
+ * item headers are in ri_buf[0]. Additional buffers follow.
+ */
+typedef struct xlog_recover_item {
+ struct list_head ri_list;
+ int ri_type;
+ int ri_cnt; /* count of regions found */
+ int ri_total; /* total regions */
+ xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
+} xlog_recover_item_t;
+
+struct xlog_tid;
+typedef struct xlog_recover {
+ struct hlist_node r_list;
+ xlog_tid_t r_log_tid; /* log's transaction id */
+ xfs_trans_header_t r_theader; /* trans header for partial */
+ int r_state; /* not needed */
+ xfs_lsn_t r_lsn; /* xact lsn */
+ struct list_head r_itemq; /* q for items */
+} xlog_recover_t;
+
+#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
+
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE 64
+
+#define XLOG_RECOVER_PASS1 1
+#define XLOG_RECOVER_PASS2 2
+
+#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
new file mode 100644
index 00000000..9afdd497
--- /dev/null
+++ b/fs/xfs/xfs_mount.c
@@ -0,0 +1,2585 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_rw.h"
+#include "xfs_quota.h"
+#include "xfs_fsops.h"
+#include "xfs_utils.h"
+#include "xfs_trace.h"
+
+
+#ifdef HAVE_PERCPU_SB
+STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
+ int);
+STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
+ int);
+STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
+#else
+
+#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
+#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
+#endif
+
+static const struct {
+ short offset;
+ short type; /* 0 = integer
+ * 1 = binary / string (no translation)
+ */
+} xfs_sb_info[] = {
+ { offsetof(xfs_sb_t, sb_magicnum), 0 },
+ { offsetof(xfs_sb_t, sb_blocksize), 0 },
+ { offsetof(xfs_sb_t, sb_dblocks), 0 },
+ { offsetof(xfs_sb_t, sb_rblocks), 0 },
+ { offsetof(xfs_sb_t, sb_rextents), 0 },
+ { offsetof(xfs_sb_t, sb_uuid), 1 },
+ { offsetof(xfs_sb_t, sb_logstart), 0 },
+ { offsetof(xfs_sb_t, sb_rootino), 0 },
+ { offsetof(xfs_sb_t, sb_rbmino), 0 },
+ { offsetof(xfs_sb_t, sb_rsumino), 0 },
+ { offsetof(xfs_sb_t, sb_rextsize), 0 },
+ { offsetof(xfs_sb_t, sb_agblocks), 0 },
+ { offsetof(xfs_sb_t, sb_agcount), 0 },
+ { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
+ { offsetof(xfs_sb_t, sb_logblocks), 0 },
+ { offsetof(xfs_sb_t, sb_versionnum), 0 },
+ { offsetof(xfs_sb_t, sb_sectsize), 0 },
+ { offsetof(xfs_sb_t, sb_inodesize), 0 },
+ { offsetof(xfs_sb_t, sb_inopblock), 0 },
+ { offsetof(xfs_sb_t, sb_fname[0]), 1 },
+ { offsetof(xfs_sb_t, sb_blocklog), 0 },
+ { offsetof(xfs_sb_t, sb_sectlog), 0 },
+ { offsetof(xfs_sb_t, sb_inodelog), 0 },
+ { offsetof(xfs_sb_t, sb_inopblog), 0 },
+ { offsetof(xfs_sb_t, sb_agblklog), 0 },
+ { offsetof(xfs_sb_t, sb_rextslog), 0 },
+ { offsetof(xfs_sb_t, sb_inprogress), 0 },
+ { offsetof(xfs_sb_t, sb_imax_pct), 0 },
+ { offsetof(xfs_sb_t, sb_icount), 0 },
+ { offsetof(xfs_sb_t, sb_ifree), 0 },
+ { offsetof(xfs_sb_t, sb_fdblocks), 0 },
+ { offsetof(xfs_sb_t, sb_frextents), 0 },
+ { offsetof(xfs_sb_t, sb_uquotino), 0 },
+ { offsetof(xfs_sb_t, sb_gquotino), 0 },
+ { offsetof(xfs_sb_t, sb_qflags), 0 },
+ { offsetof(xfs_sb_t, sb_flags), 0 },
+ { offsetof(xfs_sb_t, sb_shared_vn), 0 },
+ { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
+ { offsetof(xfs_sb_t, sb_unit), 0 },
+ { offsetof(xfs_sb_t, sb_width), 0 },
+ { offsetof(xfs_sb_t, sb_dirblklog), 0 },
+ { offsetof(xfs_sb_t, sb_logsectlog), 0 },
+ { offsetof(xfs_sb_t, sb_logsectsize),0 },
+ { offsetof(xfs_sb_t, sb_logsunit), 0 },
+ { offsetof(xfs_sb_t, sb_features2), 0 },
+ { offsetof(xfs_sb_t, sb_bad_features2), 0 },
+ { sizeof(xfs_sb_t), 0 }
+};
+
+static DEFINE_MUTEX(xfs_uuid_table_mutex);
+static int xfs_uuid_table_size;
+static uuid_t *xfs_uuid_table;
+
+/*
+ * See if the UUID is unique among mounted XFS filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
+ */
+STATIC int
+xfs_uuid_mount(
+ struct xfs_mount *mp)
+{
+ uuid_t *uuid = &mp->m_sb.sb_uuid;
+ int hole, i;
+
+ if (mp->m_flags & XFS_MOUNT_NOUUID)
+ return 0;
+
+ if (uuid_is_nil(uuid)) {
+ xfs_warn(mp, "Filesystem has nil UUID - can't mount");
+ return XFS_ERROR(EINVAL);
+ }
+
+ mutex_lock(&xfs_uuid_table_mutex);
+ for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
+ if (uuid_is_nil(&xfs_uuid_table[i])) {
+ hole = i;
+ continue;
+ }
+ if (uuid_equal(uuid, &xfs_uuid_table[i]))
+ goto out_duplicate;
+ }
+
+ if (hole < 0) {
+ xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+ (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
+ xfs_uuid_table_size * sizeof(*xfs_uuid_table),
+ KM_SLEEP);
+ hole = xfs_uuid_table_size++;
+ }
+ xfs_uuid_table[hole] = *uuid;
+ mutex_unlock(&xfs_uuid_table_mutex);
+
+ return 0;
+
+ out_duplicate:
+ mutex_unlock(&xfs_uuid_table_mutex);
+ xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
+ return XFS_ERROR(EINVAL);
+}
+
+STATIC void
+xfs_uuid_unmount(
+ struct xfs_mount *mp)
+{
+ uuid_t *uuid = &mp->m_sb.sb_uuid;
+ int i;
+
+ if (mp->m_flags & XFS_MOUNT_NOUUID)
+ return;
+
+ mutex_lock(&xfs_uuid_table_mutex);
+ for (i = 0; i < xfs_uuid_table_size; i++) {
+ if (uuid_is_nil(&xfs_uuid_table[i]))
+ continue;
+ if (!uuid_equal(uuid, &xfs_uuid_table[i]))
+ continue;
+ memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
+ break;
+ }
+ ASSERT(i < xfs_uuid_table_size);
+ mutex_unlock(&xfs_uuid_table_mutex);
+}
+
+
+/*
+ * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
+ */
+struct xfs_perag *
+xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+ struct xfs_perag *pag;
+ int ref = 0;
+
+ rcu_read_lock();
+ pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ if (pag) {
+ ASSERT(atomic_read(&pag->pag_ref) >= 0);
+ ref = atomic_inc_return(&pag->pag_ref);
+ }
+ rcu_read_unlock();
+ trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+ return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first,
+ int tag)
+{
+ struct xfs_perag *pag;
+ int found;
+ int ref;
+
+ rcu_read_lock();
+ found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+ (void **)&pag, first, 1, tag);
+ if (found <= 0) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ ref = atomic_inc_return(&pag->pag_ref);
+ rcu_read_unlock();
+ trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+ return pag;
+}
+
+void
+xfs_perag_put(struct xfs_perag *pag)
+{
+ int ref;
+
+ ASSERT(atomic_read(&pag->pag_ref) > 0);
+ ref = atomic_dec_return(&pag->pag_ref);
+ trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+STATIC void
+__xfs_free_perag(
+ struct rcu_head *head)
+{
+ struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+
+ ASSERT(atomic_read(&pag->pag_ref) == 0);
+ kmem_free(pag);
+}
+
+/*
+ * Free up the per-ag resources associated with the mount structure.
+ */
+STATIC void
+xfs_free_perag(
+ xfs_mount_t *mp)
+{
+ xfs_agnumber_t agno;
+ struct xfs_perag *pag;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, agno);
+ spin_unlock(&mp->m_perag_lock);
+ ASSERT(pag);
+ ASSERT(atomic_read(&pag->pag_ref) == 0);
+ call_rcu(&pag->rcu_head, __xfs_free_perag);
+ }
+}
+
+/*
+ * Check size of device based on the (data/realtime) block count.
+ * Note: this check is used by the growfs code as well as mount.
+ */
+int
+xfs_sb_validate_fsb_count(
+ xfs_sb_t *sbp,
+ __uint64_t nblocks)
+{
+ ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
+ ASSERT(sbp->sb_blocklog >= BBSHIFT);
+
+#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
+ if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+ return EFBIG;
+#else /* Limited by UINT_MAX of sectors */
+ if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
+ return EFBIG;
+#endif
+ return 0;
+}
+
+/*
+ * Check the validity of the SB found.
+ */
+STATIC int
+xfs_mount_validate_sb(
+ xfs_mount_t *mp,
+ xfs_sb_t *sbp,
+ int flags)
+{
+ int loud = !(flags & XFS_MFSI_QUIET);
+
+ /*
+ * If the log device and data device have the
+ * same device number, the log is internal.
+ * Consequently, the sb_logstart should be non-zero. If
+ * we have a zero sb_logstart in this case, we may be trying to mount
+ * a volume filesystem in a non-volume manner.
+ */
+ if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (loud)
+ xfs_warn(mp, "bad magic number");
+ return XFS_ERROR(EWRONGFS);
+ }
+
+ if (!xfs_sb_good_version(sbp)) {
+ if (loud)
+ xfs_warn(mp, "bad version");
+ return XFS_ERROR(EWRONGFS);
+ }
+
+ if (unlikely(
+ sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+ if (loud)
+ xfs_warn(mp,
+ "filesystem is marked as having an external log; "
+ "specify logdev on the mount command line.");
+ return XFS_ERROR(EINVAL);
+ }
+
+ if (unlikely(
+ sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+ if (loud)
+ xfs_warn(mp,
+ "filesystem is marked as having an internal log; "
+ "do not specify logdev on the mount command line.");
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
+ * More sanity checking. These were stolen directly from
+ * xfs_repair.
+ */
+ if (unlikely(
+ sbp->sb_agcount <= 0 ||
+ sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
+ sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
+ sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
+ sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
+ sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
+ sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
+ sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
+ sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
+ sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
+ sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
+ sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
+ sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
+ sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
+ sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+ sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+ (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
+ (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
+ (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
+ if (loud)
+ xfs_warn(mp, "SB sanity check 1 failed");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ /*
+ * Sanity check AG count, size fields against data size field
+ */
+ if (unlikely(
+ sbp->sb_dblocks == 0 ||
+ sbp->sb_dblocks >
+ (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
+ sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
+ sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
+ if (loud)
+ xfs_warn(mp, "SB sanity check 2 failed");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ /*
+ * Until this is fixed only page-sized or smaller data blocks work.
+ */
+ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+ if (loud) {
+ xfs_warn(mp,
+ "File system with blocksize %d bytes. "
+ "Only pagesize (%ld) or less will currently work.",
+ sbp->sb_blocksize, PAGE_SIZE);
+ }
+ return XFS_ERROR(ENOSYS);
+ }
+
+ /*
+ * Currently only very few inode sizes are supported.
+ */
+ switch (sbp->sb_inodesize) {
+ case 256:
+ case 512:
+ case 1024:
+ case 2048:
+ break;
+ default:
+ if (loud)
+ xfs_warn(mp, "inode size of %d bytes not supported",
+ sbp->sb_inodesize);
+ return XFS_ERROR(ENOSYS);
+ }
+
+ if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+ xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
+ if (loud)
+ xfs_warn(mp,
+ "file system too large to be mounted on this system.");
+ return XFS_ERROR(EFBIG);
+ }
+
+ if (unlikely(sbp->sb_inprogress)) {
+ if (loud)
+ xfs_warn(mp, "file system busy");
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ /*
+ * Version 1 directory format has never worked on Linux.
+ */
+ if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
+ if (loud)
+ xfs_warn(mp,
+ "file system using version 1 directory format");
+ return XFS_ERROR(ENOSYS);
+ }
+
+ return 0;
+}
+
+int
+xfs_initialize_perag(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agcount,
+ xfs_agnumber_t *maxagi)
+{
+ xfs_agnumber_t index, max_metadata;
+ xfs_agnumber_t first_initialised = 0;
+ xfs_perag_t *pag;
+ xfs_agino_t agino;
+ xfs_ino_t ino;
+ xfs_sb_t *sbp = &mp->m_sb;
+ int error = -ENOMEM;
+
+ /*
+ * Walk the current per-ag tree so we don't try to initialise AGs
+ * that already exist (growfs case). Allocate and insert all the
+ * AGs we don't find ready for initialisation.
+ */
+ for (index = 0; index < agcount; index++) {
+ pag = xfs_perag_get(mp, index);
+ if (pag) {
+ xfs_perag_put(pag);
+ continue;
+ }
+ if (!first_initialised)
+ first_initialised = index;
+
+ pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+ if (!pag)
+ goto out_unwind;
+ pag->pag_agno = index;
+ pag->pag_mount = mp;
+ spin_lock_init(&pag->pag_ici_lock);
+ mutex_init(&pag->pag_ici_reclaim_lock);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+ spin_lock_init(&pag->pag_buf_lock);
+ pag->pag_buf_tree = RB_ROOT;
+
+ if (radix_tree_preload(GFP_NOFS))
+ goto out_unwind;
+
+ spin_lock(&mp->m_perag_lock);
+ if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+ BUG();
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+ error = -EEXIST;
+ goto out_unwind;
+ }
+ spin_unlock(&mp->m_perag_lock);
+ radix_tree_preload_end();
+ }
+
+ /*
+ * If we mount with the inode64 option, or no inode overflows
+ * the legacy 32-bit address space clear the inode32 option.
+ */
+ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+ ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
+ mp->m_flags |= XFS_MOUNT_32BITINODES;
+ else
+ mp->m_flags &= ~XFS_MOUNT_32BITINODES;
+
+ if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ /*
+ * Calculate how much should be reserved for inodes to meet
+ * the max inode percentage.
+ */
+ if (mp->m_maxicount) {
+ __uint64_t icount;
+
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ icount += sbp->sb_agblocks - 1;
+ do_div(icount, sbp->sb_agblocks);
+ max_metadata = icount;
+ } else {
+ max_metadata = agcount;
+ }
+
+ for (index = 0; index < agcount; index++) {
+ ino = XFS_AGINO_TO_INO(mp, index, agino);
+ if (ino > XFS_MAXINUMBER_32) {
+ index++;
+ break;
+ }
+
+ pag = xfs_perag_get(mp, index);
+ pag->pagi_inodeok = 1;
+ if (index < max_metadata)
+ pag->pagf_metadata = 1;
+ xfs_perag_put(pag);
+ }
+ } else {
+ for (index = 0; index < agcount; index++) {
+ pag = xfs_perag_get(mp, index);
+ pag->pagi_inodeok = 1;
+ xfs_perag_put(pag);
+ }
+ }
+
+ if (maxagi)
+ *maxagi = index;
+ return 0;
+
+out_unwind:
+ kmem_free(pag);
+ for (; index > first_initialised; index--) {
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ kmem_free(pag);
+ }
+ return error;
+}
+
+void
+xfs_sb_from_disk(
+ xfs_sb_t *to,
+ xfs_dsb_t *from)
+{
+ to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+ to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+ to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+ to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+ to->sb_rextents = be64_to_cpu(from->sb_rextents);
+ memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+ to->sb_logstart = be64_to_cpu(from->sb_logstart);
+ to->sb_rootino = be64_to_cpu(from->sb_rootino);
+ to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+ to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+ to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+ to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+ to->sb_agcount = be32_to_cpu(from->sb_agcount);
+ to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+ to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+ to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+ to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+ to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+ to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+ memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+ to->sb_blocklog = from->sb_blocklog;
+ to->sb_sectlog = from->sb_sectlog;
+ to->sb_inodelog = from->sb_inodelog;
+ to->sb_inopblog = from->sb_inopblog;
+ to->sb_agblklog = from->sb_agblklog;
+ to->sb_rextslog = from->sb_rextslog;
+ to->sb_inprogress = from->sb_inprogress;
+ to->sb_imax_pct = from->sb_imax_pct;
+ to->sb_icount = be64_to_cpu(from->sb_icount);
+ to->sb_ifree = be64_to_cpu(from->sb_ifree);
+ to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+ to->sb_frextents = be64_to_cpu(from->sb_frextents);
+ to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+ to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+ to->sb_qflags = be16_to_cpu(from->sb_qflags);
+ to->sb_flags = from->sb_flags;
+ to->sb_shared_vn = from->sb_shared_vn;
+ to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+ to->sb_unit = be32_to_cpu(from->sb_unit);
+ to->sb_width = be32_to_cpu(from->sb_width);
+ to->sb_dirblklog = from->sb_dirblklog;
+ to->sb_logsectlog = from->sb_logsectlog;
+ to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+ to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+ to->sb_features2 = be32_to_cpu(from->sb_features2);
+ to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+}
+
+/*
+ * Copy in core superblock to ondisk one.
+ *
+ * The fields argument is mask of superblock fields to copy.
+ */
+void
+xfs_sb_to_disk(
+ xfs_dsb_t *to,
+ xfs_sb_t *from,
+ __int64_t fields)
+{
+ xfs_caddr_t to_ptr = (xfs_caddr_t)to;
+ xfs_caddr_t from_ptr = (xfs_caddr_t)from;
+ xfs_sb_field_t f;
+ int first;
+ int size;
+
+ ASSERT(fields);
+ if (!fields)
+ return;
+
+ while (fields) {
+ f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+ first = xfs_sb_info[f].offset;
+ size = xfs_sb_info[f + 1].offset - first;
+
+ ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+
+ if (size == 1 || xfs_sb_info[f].type == 1) {
+ memcpy(to_ptr + first, from_ptr + first, size);
+ } else {
+ switch (size) {
+ case 2:
+ *(__be16 *)(to_ptr + first) =
+ cpu_to_be16(*(__u16 *)(from_ptr + first));
+ break;
+ case 4:
+ *(__be32 *)(to_ptr + first) =
+ cpu_to_be32(*(__u32 *)(from_ptr + first));
+ break;
+ case 8:
+ *(__be64 *)(to_ptr + first) =
+ cpu_to_be64(*(__u64 *)(from_ptr + first));
+ break;
+ default:
+ ASSERT(0);
+ }
+ }
+
+ fields &= ~(1LL << f);
+ }
+}
+
+/*
+ * xfs_readsb
+ *
+ * Does the initial read of the superblock.
+ */
+int
+xfs_readsb(xfs_mount_t *mp, int flags)
+{
+ unsigned int sector_size;
+ xfs_buf_t *bp;
+ int error;
+ int loud = !(flags & XFS_MFSI_QUIET);
+
+ ASSERT(mp->m_sb_bp == NULL);
+ ASSERT(mp->m_ddev_targp != NULL);
+
+ /*
+ * Allocate a (locked) buffer to hold the superblock.
+ * This will be kept around at all times to optimize
+ * access to the superblock.
+ */
+ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+
+reread:
+ bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ XFS_SB_DADDR, sector_size, 0);
+ if (!bp) {
+ if (loud)
+ xfs_warn(mp, "SB buffer read failed");
+ return EIO;
+ }
+
+ /*
+ * Initialize the mount structure from the superblock.
+ * But first do some basic consistency checking.
+ */
+ xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+ error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
+ if (error) {
+ if (loud)
+ xfs_warn(mp, "SB validate failed");
+ goto release_buf;
+ }
+
+ /*
+ * We must be able to do sector-sized and sector-aligned IO.
+ */
+ if (sector_size > mp->m_sb.sb_sectsize) {
+ if (loud)
+ xfs_warn(mp, "device supports %u byte sectors (not %u)",
+ sector_size, mp->m_sb.sb_sectsize);
+ error = ENOSYS;
+ goto release_buf;
+ }
+
+ /*
+ * If device sector size is smaller than the superblock size,
+ * re-read the superblock so the buffer is correctly sized.
+ */
+ if (sector_size < mp->m_sb.sb_sectsize) {
+ xfs_buf_relse(bp);
+ sector_size = mp->m_sb.sb_sectsize;
+ goto reread;
+ }
+
+ /* Initialize per-cpu counters */
+ xfs_icsb_reinit_counters(mp);
+
+ mp->m_sb_bp = bp;
+ xfs_buf_unlock(bp);
+ return 0;
+
+release_buf:
+ xfs_buf_relse(bp);
+ return error;
+}
+
+
+/*
+ * xfs_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure
+ */
+STATIC void
+xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
+{
+ mp->m_agfrotor = mp->m_agirotor = 0;
+ spin_lock_init(&mp->m_agirotor_lock);
+ mp->m_maxagi = mp->m_sb.sb_agcount;
+ mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+ mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+ mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+ mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+ mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+ mp->m_blockmask = sbp->sb_blocksize - 1;
+ mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+ mp->m_blockwmask = mp->m_blockwsize - 1;
+
+ mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+ mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+ mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+ mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+ mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+ mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+ mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+ mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+ sbp->sb_inopblock);
+ mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+STATIC int
+xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
+{
+ xfs_agnumber_t index;
+ xfs_perag_t *pag;
+ xfs_sb_t *sbp = &mp->m_sb;
+ uint64_t ifree = 0;
+ uint64_t ialloc = 0;
+ uint64_t bfree = 0;
+ uint64_t bfreelst = 0;
+ uint64_t btree = 0;
+ int error;
+
+ for (index = 0; index < agcount; index++) {
+ /*
+ * read the agf, then the agi. This gets us
+ * all the information we need and populates the
+ * per-ag structures for us.
+ */
+ error = xfs_alloc_pagf_init(mp, NULL, index, 0);
+ if (error)
+ return error;
+
+ error = xfs_ialloc_pagi_init(mp, NULL, index);
+ if (error)
+ return error;
+ pag = xfs_perag_get(mp, index);
+ ifree += pag->pagi_freecount;
+ ialloc += pag->pagi_count;
+ bfree += pag->pagf_freeblks;
+ bfreelst += pag->pagf_flcount;
+ btree += pag->pagf_btreeblks;
+ xfs_perag_put(pag);
+ }
+ /*
+ * Overwrite incore superblock counters with just-read data
+ */
+ spin_lock(&mp->m_sb_lock);
+ sbp->sb_ifree = ifree;
+ sbp->sb_icount = ialloc;
+ sbp->sb_fdblocks = bfree + bfreelst + btree;
+ spin_unlock(&mp->m_sb_lock);
+
+ /* Fixup the per-cpu counters as well. */
+ xfs_icsb_reinit_counters(mp);
+
+ return 0;
+}
+
+/*
+ * Update alignment values based on mount options and sb values
+ */
+STATIC int
+xfs_update_alignment(xfs_mount_t *mp)
+{
+ xfs_sb_t *sbp = &(mp->m_sb);
+
+ if (mp->m_dalign) {
+ /*
+ * If stripe unit and stripe width are not multiples
+ * of the fs blocksize turn off alignment.
+ */
+ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+ (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ if (mp->m_flags & XFS_MOUNT_RETERR) {
+ xfs_warn(mp, "alignment check 1 failed");
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_dalign = mp->m_swidth = 0;
+ } else {
+ /*
+ * Convert the stripe unit and width to FSBs.
+ */
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
+ if (mp->m_flags & XFS_MOUNT_RETERR) {
+ return XFS_ERROR(EINVAL);
+ }
+ xfs_warn(mp,
+ "stripe alignment turned off: sunit(%d)/swidth(%d) "
+ "incompatible with agsize(%d)",
+ mp->m_dalign, mp->m_swidth,
+ sbp->sb_agblocks);
+
+ mp->m_dalign = 0;
+ mp->m_swidth = 0;
+ } else if (mp->m_dalign) {
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+ } else {
+ if (mp->m_flags & XFS_MOUNT_RETERR) {
+ xfs_warn(mp,
+ "stripe alignment turned off: sunit(%d) less than bsize(%d)",
+ mp->m_dalign,
+ mp->m_blockmask +1);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_swidth = 0;
+ }
+ }
+
+ /*
+ * Update superblock with new values
+ * and log changes
+ */
+ if (xfs_sb_version_hasdalign(sbp)) {
+ if (sbp->sb_unit != mp->m_dalign) {
+ sbp->sb_unit = mp->m_dalign;
+ mp->m_update_flags |= XFS_SB_UNIT;
+ }
+ if (sbp->sb_width != mp->m_swidth) {
+ sbp->sb_width = mp->m_swidth;
+ mp->m_update_flags |= XFS_SB_WIDTH;
+ }
+ }
+ } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+ xfs_sb_version_hasdalign(&mp->m_sb)) {
+ mp->m_dalign = sbp->sb_unit;
+ mp->m_swidth = sbp->sb_width;
+ }
+
+ return 0;
+}
+
+/*
+ * Set the maximum inode count for this filesystem
+ */
+STATIC void
+xfs_set_maxicount(xfs_mount_t *mp)
+{
+ xfs_sb_t *sbp = &(mp->m_sb);
+ __uint64_t icount;
+
+ if (sbp->sb_imax_pct) {
+ /*
+ * Make sure the maximum inode count is a multiple
+ * of the units we allocate inodes in.
+ */
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ do_div(icount, mp->m_ialloc_blks);
+ mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
+ sbp->sb_inopblog;
+ } else {
+ mp->m_maxicount = 0;
+ }
+}
+
+/*
+ * Set the default minimum read and write sizes unless
+ * already specified in a mount option.
+ * We use smaller I/O sizes when the file system
+ * is being used for NFS service (wsync mount option).
+ */
+STATIC void
+xfs_set_rw_sizes(xfs_mount_t *mp)
+{
+ xfs_sb_t *sbp = &(mp->m_sb);
+ int readio_log, writeio_log;
+
+ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ readio_log = XFS_WSYNC_READIO_LOG;
+ writeio_log = XFS_WSYNC_WRITEIO_LOG;
+ } else {
+ readio_log = XFS_READIO_LOG_LARGE;
+ writeio_log = XFS_WRITEIO_LOG_LARGE;
+ }
+ } else {
+ readio_log = mp->m_readio_log;
+ writeio_log = mp->m_writeio_log;
+ }
+
+ if (sbp->sb_blocklog > readio_log) {
+ mp->m_readio_log = sbp->sb_blocklog;
+ } else {
+ mp->m_readio_log = readio_log;
+ }
+ mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
+ if (sbp->sb_blocklog > writeio_log) {
+ mp->m_writeio_log = sbp->sb_blocklog;
+ } else {
+ mp->m_writeio_log = writeio_log;
+ }
+ mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
+}
+
+/*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+ struct xfs_mount *mp)
+{
+ int i;
+
+ for (i = 0; i < XFS_LOWSP_MAX; i++) {
+ __uint64_t space = mp->m_sb.sb_dblocks;
+
+ do_div(space, 100);
+ mp->m_low_space[i] = space * (i + 1);
+ }
+}
+
+
+/*
+ * Set whether we're using inode alignment.
+ */
+STATIC void
+xfs_set_inoalignment(xfs_mount_t *mp)
+{
+ if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ mp->m_sb.sb_inoalignmt >=
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+ mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
+ else
+ mp->m_inoalign_mask = 0;
+ /*
+ * If we are using stripe alignment, check whether
+ * the stripe unit is a multiple of the inode alignment
+ */
+ if (mp->m_dalign && mp->m_inoalign_mask &&
+ !(mp->m_dalign & mp->m_inoalign_mask))
+ mp->m_sinoalign = mp->m_dalign;
+ else
+ mp->m_sinoalign = 0;
+}
+
+/*
+ * Check that the data (and log if separate) are an ok size.
+ */
+STATIC int
+xfs_check_sizes(xfs_mount_t *mp)
+{
+ xfs_buf_t *bp;
+ xfs_daddr_t d;
+
+ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
+ xfs_warn(mp, "filesystem size mismatch detected");
+ return XFS_ERROR(EFBIG);
+ }
+ bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
+ d - XFS_FSS_TO_BB(mp, 1),
+ BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
+ if (!bp) {
+ xfs_warn(mp, "last sector read failed");
+ return EIO;
+ }
+ xfs_buf_relse(bp);
+
+ if (mp->m_logdev_targp != mp->m_ddev_targp) {
+ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+ if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+ xfs_warn(mp, "log size mismatch detected");
+ return XFS_ERROR(EFBIG);
+ }
+ bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
+ d - XFS_FSB_TO_BB(mp, 1),
+ XFS_FSB_TO_B(mp, 1), 0);
+ if (!bp) {
+ xfs_warn(mp, "log device read failed");
+ return EIO;
+ }
+ xfs_buf_relse(bp);
+ }
+ return 0;
+}
+
+/*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(
+ struct xfs_mount *mp)
+{
+ int error;
+ struct xfs_trans *tp;
+
+ mp->m_qflags = 0;
+
+ /*
+ * It is OK to look at sb_qflags here in mount path,
+ * without m_sb_lock.
+ */
+ if (mp->m_sb.sb_qflags == 0)
+ return 0;
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = 0;
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * If the fs is readonly, let the incore superblock run
+ * with quotas off but don't flush the update out to disk
+ */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return 0;
+
+#ifdef QUOTADEBUG
+ xfs_notice(mp, "Writing superblock quota changes");
+#endif
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+ error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ xfs_alert(mp, "%s: Superblock update failed!", __func__);
+ return error;
+ }
+
+ xfs_mod_sb(tp, XFS_SB_QFLAGS);
+ return xfs_trans_commit(tp, 0);
+}
+
+__uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+ __uint64_t resblks;
+
+ /*
+ * We default to 5% or 8192 fsbs of space reserved, whichever is
+ * smaller. This is intended to cover concurrent allocation
+ * transactions when we initially hit enospc. These each require a 4
+ * block reservation. Hence by default we cover roughly 2000 concurrent
+ * allocation reservations.
+ */
+ resblks = mp->m_sb.sb_dblocks;
+ do_div(resblks, 20);
+ resblks = min_t(__uint64_t, resblks, 8192);
+ return resblks;
+}
+
+/*
+ * This function does the following on an initial mount of a file system:
+ * - reads the superblock from disk and init the mount struct
+ * - if we're a 32-bit kernel, do a size check on the superblock
+ * so we don't mount terabyte filesystems
+ * - init mount struct realtime fields
+ * - allocate inode hash table for fs
+ * - init directory manager
+ * - perform recovery and init the log manager
+ */
+int
+xfs_mountfs(
+ xfs_mount_t *mp)
+{
+ xfs_sb_t *sbp = &(mp->m_sb);
+ xfs_inode_t *rip;
+ __uint64_t resblks;
+ uint quotamount = 0;
+ uint quotaflags = 0;
+ int error = 0;
+
+ xfs_mount_common(mp, sbp);
+
+ /*
+ * Check for a mismatched features2 values. Older kernels
+ * read & wrote into the wrong sb offset for sb_features2
+ * on some platforms due to xfs_sb_t not being 64bit size aligned
+ * when sb_features2 was added, which made older superblock
+ * reading/writing routines swap it as a 64-bit value.
+ *
+ * For backwards compatibility, we make both slots equal.
+ *
+ * If we detect a mismatched field, we OR the set bits into the
+ * existing features2 field in case it has already been modified; we
+ * don't want to lose any features. We then update the bad location
+ * with the ORed value so that older kernels will see any features2
+ * flags, and mark the two fields as needing updates once the
+ * transaction subsystem is online.
+ */
+ if (xfs_sb_has_mismatched_features2(sbp)) {
+ xfs_warn(mp, "correcting sb_features alignment problem");
+ sbp->sb_features2 |= sbp->sb_bad_features2;
+ sbp->sb_bad_features2 = sbp->sb_features2;
+ mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
+
+ /*
+ * Re-check for ATTR2 in case it was found in bad_features2
+ * slot.
+ */
+ if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+ !(mp->m_flags & XFS_MOUNT_NOATTR2))
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+ }
+
+ if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+ (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+ xfs_sb_version_removeattr2(&mp->m_sb);
+ mp->m_update_flags |= XFS_SB_FEATURES2;
+
+ /* update sb_versionnum for the clearing of the morebits */
+ if (!sbp->sb_features2)
+ mp->m_update_flags |= XFS_SB_VERSIONNUM;
+ }
+
+ /*
+ * Check if sb_agblocks is aligned at stripe boundary
+ * If sb_agblocks is NOT aligned turn off m_dalign since
+ * allocator alignment is within an ag, therefore ag has
+ * to be aligned at stripe boundary.
+ */
+ error = xfs_update_alignment(mp);
+ if (error)
+ goto out;
+
+ xfs_alloc_compute_maxlevels(mp);
+ xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
+ xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
+ xfs_ialloc_compute_maxlevels(mp);
+
+ xfs_set_maxicount(mp);
+
+ mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
+
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out;
+
+ /*
+ * Set the minimum read and write sizes
+ */
+ xfs_set_rw_sizes(mp);
+
+ /* set the low space thresholds for dynamic preallocation */
+ xfs_set_low_space_thresholds(mp);
+
+ /*
+ * Set the inode cluster size.
+ * This may still be overridden by the file system
+ * block size if it is larger than the chosen cluster size.
+ */
+ mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+
+ /*
+ * Set inode alignment fields
+ */
+ xfs_set_inoalignment(mp);
+
+ /*
+ * Check that the data (and log if separate) are an ok size.
+ */
+ error = xfs_check_sizes(mp);
+ if (error)
+ goto out_remove_uuid;
+
+ /*
+ * Initialize realtime fields in the mount structure
+ */
+ error = xfs_rtmount_init(mp);
+ if (error) {
+ xfs_warn(mp, "RT mount failed");
+ goto out_remove_uuid;
+ }
+
+ /*
+ * Copies the low order bits of the timestamp and the randomly
+ * set "sequence" number out of a UUID.
+ */
+ uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
+
+ mp->m_dmevmask = 0; /* not persistent; set after each mount */
+
+ xfs_dir_mount(mp);
+
+ /*
+ * Initialize the attribute manager's entries.
+ */
+ mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
+
+ /*
+ * Initialize the precomputed transaction reservations values.
+ */
+ xfs_trans_init(mp);
+
+ /*
+ * Allocate and initialize the per-ag data.
+ */
+ spin_lock_init(&mp->m_perag_lock);
+ INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ if (error) {
+ xfs_warn(mp, "Failed per-ag init: %d", error);
+ goto out_remove_uuid;
+ }
+
+ if (!sbp->sb_logblocks) {
+ xfs_warn(mp, "no log defined");
+ XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out_free_perag;
+ }
+
+ /*
+ * log's mount-time initialization. Perform 1st part recovery if needed
+ */
+ error = xfs_log_mount(mp, mp->m_logdev_targp,
+ XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+ XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+ if (error) {
+ xfs_warn(mp, "log mount failed");
+ goto out_free_perag;
+ }
+
+ /*
+ * Now the log is mounted, we know if it was an unclean shutdown or
+ * not. If it was, with the first phase of recovery has completed, we
+ * have consistent AG blocks on disk. We have not recovered EFIs yet,
+ * but they are recovered transactionally in the second recovery phase
+ * later.
+ *
+ * Hence we can safely re-initialise incore superblock counters from
+ * the per-ag data. These may not be correct if the filesystem was not
+ * cleanly unmounted, so we need to wait for recovery to finish before
+ * doing this.
+ *
+ * If the filesystem was cleanly unmounted, then we can trust the
+ * values in the superblock to be correct and we don't need to do
+ * anything here.
+ *
+ * If we are currently making the filesystem, the initialisation will
+ * fail as the perag data is in an undefined state.
+ */
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+ !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
+ !mp->m_sb.sb_inprogress) {
+ error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
+ if (error)
+ goto out_free_perag;
+ }
+
+ /*
+ * Get and sanity-check the root inode.
+ * Save the pointer to it in the mount structure.
+ */
+ error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
+ if (error) {
+ xfs_warn(mp, "failed to read root inode");
+ goto out_log_dealloc;
+ }
+
+ ASSERT(rip != NULL);
+
+ if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
+ xfs_warn(mp, "corrupted root inode %llu: not a directory",
+ (unsigned long long)rip->i_ino);
+ xfs_iunlock(rip, XFS_ILOCK_EXCL);
+ XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
+ mp);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out_rele_rip;
+ }
+ mp->m_rootip = rip; /* save it */
+
+ xfs_iunlock(rip, XFS_ILOCK_EXCL);
+
+ /*
+ * Initialize realtime inode pointers in the mount structure
+ */
+ error = xfs_rtmount_inodes(mp);
+ if (error) {
+ /*
+ * Free up the root inode.
+ */
+ xfs_warn(mp, "failed to read RT inodes");
+ goto out_rele_rip;
+ }
+
+ /*
+ * If this is a read-only mount defer the superblock updates until
+ * the next remount into writeable mode. Otherwise we would never
+ * perform the update e.g. for the root filesystem.
+ */
+ if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ error = xfs_mount_log_sb(mp, mp->m_update_flags);
+ if (error) {
+ xfs_warn(mp, "failed to write sb changes");
+ goto out_rtunmount;
+ }
+ }
+
+ /*
+ * Initialise the XFS quota management subsystem for this mount
+ */
+ if (XFS_IS_QUOTA_RUNNING(mp)) {
+ error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
+ if (error)
+ goto out_rtunmount;
+ } else {
+ ASSERT(!XFS_IS_QUOTA_ON(mp));
+
+ /*
+ * If a file system had quotas running earlier, but decided to
+ * mount without -o uquota/pquota/gquota options, revoke the
+ * quotachecked license.
+ */
+ if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+ xfs_notice(mp, "resetting quota flags");
+ error = xfs_mount_reset_sbqflags(mp);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Finish recovering the file system. This part needed to be
+ * delayed until after the root and real-time bitmap inodes
+ * were consistently read in.
+ */
+ error = xfs_log_mount_finish(mp);
+ if (error) {
+ xfs_warn(mp, "log mount finish failed");
+ goto out_rtunmount;
+ }
+
+ /*
+ * Complete the quota initialisation, post-log-replay component.
+ */
+ if (quotamount) {
+ ASSERT(mp->m_qflags == 0);
+ mp->m_qflags = quotaflags;
+
+ xfs_qm_mount_quotas(mp);
+ }
+
+ /*
+ * Now we are mounted, reserve a small amount of unused space for
+ * privileged transactions. This is needed so that transaction
+ * space required for critical operations can dip into this pool
+ * when at ENOSPC. This is needed for operations like create with
+ * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
+ * are not allowed to use this reserved space.
+ *
+ * This may drive us straight to ENOSPC on mount, but that implies
+ * we were already there on the last unmount. Warn if this occurs.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ resblks = xfs_default_resblks(mp);
+ error = xfs_reserve_blocks(mp, &resblks, NULL);
+ if (error)
+ xfs_warn(mp,
+ "Unable to allocate reserve blocks. Continuing without reserve pool.");
+ }
+
+ return 0;
+
+ out_rtunmount:
+ xfs_rtunmount_inodes(mp);
+ out_rele_rip:
+ IRELE(rip);
+ out_log_dealloc:
+ xfs_log_unmount(mp);
+ out_free_perag:
+ xfs_free_perag(mp);
+ out_remove_uuid:
+ xfs_uuid_unmount(mp);
+ out:
+ return error;
+}
+
+/*
+ * This flushes out the inodes,dquots and the superblock, unmounts the
+ * log and makes sure that incore structures are freed.
+ */
+void
+xfs_unmountfs(
+ struct xfs_mount *mp)
+{
+ __uint64_t resblks;
+ int error;
+
+ xfs_qm_unmount_quotas(mp);
+ xfs_rtunmount_inodes(mp);
+ IRELE(mp->m_rootip);
+
+ /*
+ * We can potentially deadlock here if we have an inode cluster
+ * that has been freed has its buffer still pinned in memory because
+ * the transaction is still sitting in a iclog. The stale inodes
+ * on that buffer will have their flush locks held until the
+ * transaction hits the disk and the callbacks run. the inode
+ * flush takes the flush lock unconditionally and with nothing to
+ * push out the iclog we will never get that unlocked. hence we
+ * need to force the log first.
+ */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * Do a delwri reclaim pass first so that as many dirty inodes are
+ * queued up for IO as possible. Then flush the buffers before making
+ * a synchronous path to catch all the remaining inodes are reclaimed.
+ * This makes the reclaim process as quick as possible by avoiding
+ * synchronous writeout and blocking on inodes already in the delwri
+ * state as much as possible.
+ */
+ xfs_reclaim_inodes(mp, 0);
+ XFS_bflush(mp->m_ddev_targp);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+ xfs_qm_unmount(mp);
+
+ /*
+ * Flush out the log synchronously so that we know for sure
+ * that nothing is pinned. This is important because bflush()
+ * will skip pinned buffers.
+ */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * Unreserve any blocks we have so that when we unmount we don't account
+ * the reserved free space as used. This is really only necessary for
+ * lazy superblock counting because it trusts the incore superblock
+ * counters to be absolutely correct on clean unmount.
+ *
+ * We don't bother correcting this elsewhere for lazy superblock
+ * counting because on mount of an unclean filesystem we reconstruct the
+ * correct counter value and this is irrelevant.
+ *
+ * For non-lazy counter filesystems, this doesn't matter at all because
+ * we only every apply deltas to the superblock and hence the incore
+ * value does not matter....
+ */
+ resblks = 0;
+ error = xfs_reserve_blocks(mp, &resblks, NULL);
+ if (error)
+ xfs_warn(mp, "Unable to free reserved block pool. "
+ "Freespace may not be correct on next mount.");
+
+ error = xfs_log_sbcount(mp, 1);
+ if (error)
+ xfs_warn(mp, "Unable to update superblock counters. "
+ "Freespace may not be correct on next mount.");
+ xfs_unmountfs_writesb(mp);
+
+ /*
+ * Make sure all buffers have been flushed and completed before
+ * unmounting the log.
+ */
+ error = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+ if (error)
+ xfs_warn(mp, "%d busy buffers during unmount.", error);
+ xfs_wait_buftarg(mp->m_ddev_targp);
+
+ xfs_log_unmount_write(mp);
+ xfs_log_unmount(mp);
+ xfs_uuid_unmount(mp);
+
+#if defined(DEBUG)
+ xfs_errortag_clearall(mp, 0);
+#endif
+ xfs_free_perag(mp);
+}
+
+int
+xfs_fs_writable(xfs_mount_t *mp)
+{
+ return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) ||
+ (mp->m_flags & XFS_MOUNT_RDONLY));
+}
+
+/*
+ * xfs_log_sbcount
+ *
+ * Called either periodically to keep the on disk superblock values
+ * roughly up to date or from unmount to make sure the values are
+ * correct on a clean unmount.
+ *
+ * Note this code can be called during the process of freezing, so
+ * we may need to use the transaction allocator which does not not
+ * block when the transaction subsystem is in its frozen state.
+ */
+int
+xfs_log_sbcount(
+ xfs_mount_t *mp,
+ uint sync)
+{
+ xfs_trans_t *tp;
+ int error;
+
+ if (!xfs_fs_writable(mp))
+ return 0;
+
+ xfs_icsb_sync_counters(mp, 0);
+
+ /*
+ * we don't need to do this if we are updating the superblock
+ * counters on every modification.
+ */
+ if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
+ return 0;
+
+ tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
+ error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
+ if (sync)
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
+ return error;
+}
+
+int
+xfs_unmountfs_writesb(xfs_mount_t *mp)
+{
+ xfs_buf_t *sbp;
+ int error = 0;
+
+ /*
+ * skip superblock write if fs is read-only, or
+ * if we are doing a forced umount.
+ */
+ if (!((mp->m_flags & XFS_MOUNT_RDONLY) ||
+ XFS_FORCED_SHUTDOWN(mp))) {
+
+ sbp = xfs_getsb(mp, 0);
+
+ XFS_BUF_UNDONE(sbp);
+ XFS_BUF_UNREAD(sbp);
+ XFS_BUF_UNDELAYWRITE(sbp);
+ XFS_BUF_WRITE(sbp);
+ XFS_BUF_UNASYNC(sbp);
+ ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
+ xfsbdstrat(mp, sbp);
+ error = xfs_buf_iowait(sbp);
+ if (error)
+ xfs_ioerror_alert("xfs_unmountfs_writesb",
+ mp, sbp, XFS_BUF_ADDR(sbp));
+ xfs_buf_relse(sbp);
+ }
+ return error;
+}
+
+/*
+ * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * in-core superblock into the superblock buffer to be logged.
+ * It does not provide the higher level of locking that is
+ * needed to protect the in-core superblock from concurrent
+ * access.
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+ xfs_buf_t *bp;
+ int first;
+ int last;
+ xfs_mount_t *mp;
+ xfs_sb_field_t f;
+
+ ASSERT(fields);
+ if (!fields)
+ return;
+ mp = tp->t_mountp;
+ bp = xfs_trans_getsb(tp, mp, 0);
+ first = sizeof(xfs_sb_t);
+ last = 0;
+
+ /* translate/copy */
+
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+
+ /* find modified range */
+ f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+ ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+ last = xfs_sb_info[f + 1].offset - 1;
+
+ f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+ ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+ first = xfs_sb_info[f].offset;
+
+ xfs_trans_log_buf(tp, bp, first, last);
+}
+
+
+/*
+ * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
+ * a delta to a specified field in the in-core superblock. Simply
+ * switch on the field indicated and apply the delta to that field.
+ * Fields are not allowed to dip below zero, so if the delta would
+ * do this do not apply it and return EINVAL.
+ *
+ * The m_sb_lock must be held when this routine is called.
+ */
+STATIC int
+xfs_mod_incore_sb_unlocked(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ int64_t delta,
+ int rsvd)
+{
+ int scounter; /* short counter for 32 bit fields */
+ long long lcounter; /* long counter for 64 bit fields */
+ long long res_used, rem;
+
+ /*
+ * With the in-core superblock spin lock held, switch
+ * on the indicated field. Apply the delta to the
+ * proper field. If the fields value would dip below
+ * 0, then do not apply the delta and return EINVAL.
+ */
+ switch (field) {
+ case XFS_SBS_ICOUNT:
+ lcounter = (long long)mp->m_sb.sb_icount;
+ lcounter += delta;
+ if (lcounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_icount = lcounter;
+ return 0;
+ case XFS_SBS_IFREE:
+ lcounter = (long long)mp->m_sb.sb_ifree;
+ lcounter += delta;
+ if (lcounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_ifree = lcounter;
+ return 0;
+ case XFS_SBS_FDBLOCKS:
+ lcounter = (long long)
+ mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
+
+ if (delta > 0) { /* Putting blocks back */
+ if (res_used > delta) {
+ mp->m_resblks_avail += delta;
+ } else {
+ rem = delta - res_used;
+ mp->m_resblks_avail = mp->m_resblks;
+ lcounter += rem;
+ }
+ } else { /* Taking blocks away */
+ lcounter += delta;
+ if (lcounter >= 0) {
+ mp->m_sb.sb_fdblocks = lcounter +
+ XFS_ALLOC_SET_ASIDE(mp);
+ return 0;
+ }
+
+ /*
+ * We are out of blocks, use any available reserved
+ * blocks if were allowed to.
+ */
+ if (!rsvd)
+ return XFS_ERROR(ENOSPC);
+
+ lcounter = (long long)mp->m_resblks_avail + delta;
+ if (lcounter >= 0) {
+ mp->m_resblks_avail = lcounter;
+ return 0;
+ }
+ printk_once(KERN_WARNING
+ "Filesystem \"%s\": reserve blocks depleted! "
+ "Consider increasing reserve pool size.",
+ mp->m_fsname);
+ return XFS_ERROR(ENOSPC);
+ }
+
+ mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
+ return 0;
+ case XFS_SBS_FREXTENTS:
+ lcounter = (long long)mp->m_sb.sb_frextents;
+ lcounter += delta;
+ if (lcounter < 0) {
+ return XFS_ERROR(ENOSPC);
+ }
+ mp->m_sb.sb_frextents = lcounter;
+ return 0;
+ case XFS_SBS_DBLOCKS:
+ lcounter = (long long)mp->m_sb.sb_dblocks;
+ lcounter += delta;
+ if (lcounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_dblocks = lcounter;
+ return 0;
+ case XFS_SBS_AGCOUNT:
+ scounter = mp->m_sb.sb_agcount;
+ scounter += delta;
+ if (scounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_agcount = scounter;
+ return 0;
+ case XFS_SBS_IMAX_PCT:
+ scounter = mp->m_sb.sb_imax_pct;
+ scounter += delta;
+ if (scounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_imax_pct = scounter;
+ return 0;
+ case XFS_SBS_REXTSIZE:
+ scounter = mp->m_sb.sb_rextsize;
+ scounter += delta;
+ if (scounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_rextsize = scounter;
+ return 0;
+ case XFS_SBS_RBMBLOCKS:
+ scounter = mp->m_sb.sb_rbmblocks;
+ scounter += delta;
+ if (scounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_rbmblocks = scounter;
+ return 0;
+ case XFS_SBS_RBLOCKS:
+ lcounter = (long long)mp->m_sb.sb_rblocks;
+ lcounter += delta;
+ if (lcounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_rblocks = lcounter;
+ return 0;
+ case XFS_SBS_REXTENTS:
+ lcounter = (long long)mp->m_sb.sb_rextents;
+ lcounter += delta;
+ if (lcounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_rextents = lcounter;
+ return 0;
+ case XFS_SBS_REXTSLOG:
+ scounter = mp->m_sb.sb_rextslog;
+ scounter += delta;
+ if (scounter < 0) {
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_sb.sb_rextslog = scounter;
+ return 0;
+ default:
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+}
+
+/*
+ * xfs_mod_incore_sb() is used to change a field in the in-core
+ * superblock structure by the specified delta. This modification
+ * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked()
+ * routine to do the work.
+ */
+int
+xfs_mod_incore_sb(
+ struct xfs_mount *mp,
+ xfs_sb_field_t field,
+ int64_t delta,
+ int rsvd)
+{
+ int status;
+
+#ifdef HAVE_PERCPU_SB
+ ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
+#endif
+ spin_lock(&mp->m_sb_lock);
+ status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+ spin_unlock(&mp->m_sb_lock);
+
+ return status;
+}
+
+/*
+ * Change more than one field in the in-core superblock structure at a time.
+ *
+ * The fields and changes to those fields are specified in the array of
+ * xfs_mod_sb structures passed in. Either all of the specified deltas
+ * will be applied or none of them will. If any modified field dips below 0,
+ * then all modifications will be backed out and EINVAL will be returned.
+ *
+ * Note that this function may not be used for the superblock values that
+ * are tracked with the in-memory per-cpu counters - a direct call to
+ * xfs_icsb_modify_counters is required for these.
+ */
+int
+xfs_mod_incore_sb_batch(
+ struct xfs_mount *mp,
+ xfs_mod_sb_t *msb,
+ uint nmsb,
+ int rsvd)
+{
+ xfs_mod_sb_t *msbp;
+ int error = 0;
+
+ /*
+ * Loop through the array of mod structures and apply each individually.
+ * If any fail, then back out all those which have already been applied.
+ * Do all of this within the scope of the m_sb_lock so that all of the
+ * changes will be atomic.
+ */
+ spin_lock(&mp->m_sb_lock);
+ for (msbp = msb; msbp < (msb + nmsb); msbp++) {
+ ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
+ msbp->msb_field > XFS_SBS_FDBLOCKS);
+
+ error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+ msbp->msb_delta, rsvd);
+ if (error)
+ goto unwind;
+ }
+ spin_unlock(&mp->m_sb_lock);
+ return 0;
+
+unwind:
+ while (--msbp >= msb) {
+ error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+ -msbp->msb_delta, rsvd);
+ ASSERT(error == 0);
+ }
+ spin_unlock(&mp->m_sb_lock);
+ return error;
+}
+
+/*
+ * xfs_getsb() is called to obtain the buffer for the superblock.
+ * The buffer is returned locked and read in from disk.
+ * The buffer should be released with a call to xfs_brelse().
+ *
+ * If the flags parameter is BUF_TRYLOCK, then we'll only return
+ * the superblock buffer if it can be locked without sleeping.
+ * If it can't then we'll return NULL.
+ */
+xfs_buf_t *
+xfs_getsb(
+ xfs_mount_t *mp,
+ int flags)
+{
+ xfs_buf_t *bp;
+
+ ASSERT(mp->m_sb_bp != NULL);
+ bp = mp->m_sb_bp;
+ if (flags & XBF_TRYLOCK) {
+ if (!XFS_BUF_CPSEMA(bp)) {
+ return NULL;
+ }
+ } else {
+ XFS_BUF_PSEMA(bp, PRIBIO);
+ }
+ XFS_BUF_HOLD(bp);
+ ASSERT(XFS_BUF_ISDONE(bp));
+ return bp;
+}
+
+/*
+ * Used to free the superblock along various error paths.
+ */
+void
+xfs_freesb(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *bp = mp->m_sb_bp;
+
+ xfs_buf_lock(bp);
+ mp->m_sb_bp = NULL;
+ xfs_buf_relse(bp);
+}
+
+/*
+ * Used to log changes to the superblock unit and width fields which could
+ * be altered by the mount options, as well as any potential sb_features2
+ * fixup. Only the first superblock is updated.
+ */
+int
+xfs_mount_log_sb(
+ xfs_mount_t *mp,
+ __int64_t fields)
+{
+ xfs_trans_t *tp;
+ int error;
+
+ ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
+ XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
+ XFS_SB_VERSIONNUM));
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
+ error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+ XFS_DEFAULT_LOG_COUNT);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+ xfs_mod_sb(tp, fields);
+ error = xfs_trans_commit(tp, 0);
+ return error;
+}
+
+/*
+ * If the underlying (data/log/rt) device is readonly, there are some
+ * operations that cannot proceed.
+ */
+int
+xfs_dev_is_read_only(
+ struct xfs_mount *mp,
+ char *message)
+{
+ if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+ xfs_readonly_buftarg(mp->m_logdev_targp) ||
+ (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+ xfs_notice(mp, "%s required on read-only device.", message);
+ xfs_notice(mp, "write access unavailable, cannot proceed.");
+ return EROFS;
+ }
+ return 0;
+}
+
+#ifdef HAVE_PERCPU_SB
+/*
+ * Per-cpu incore superblock counters
+ *
+ * Simple concept, difficult implementation
+ *
+ * Basically, replace the incore superblock counters with a distributed per cpu
+ * counter for contended fields (e.g. free block count).
+ *
+ * Difficulties arise in that the incore sb is used for ENOSPC checking, and
+ * hence needs to be accurately read when we are running low on space. Hence
+ * there is a method to enable and disable the per-cpu counters based on how
+ * much "stuff" is available in them.
+ *
+ * Basically, a counter is enabled if there is enough free resource to justify
+ * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
+ * ENOSPC), then we disable the counters to synchronise all callers and
+ * re-distribute the available resources.
+ *
+ * If, once we redistributed the available resources, we still get a failure,
+ * we disable the per-cpu counter and go through the slow path.
+ *
+ * The slow path is the current xfs_mod_incore_sb() function. This means that
+ * when we disable a per-cpu counter, we need to drain its resources back to
+ * the global superblock. We do this after disabling the counter to prevent
+ * more threads from queueing up on the counter.
+ *
+ * Essentially, this means that we still need a lock in the fast path to enable
+ * synchronisation between the global counters and the per-cpu counters. This
+ * is not a problem because the lock will be local to a CPU almost all the time
+ * and have little contention except when we get to ENOSPC conditions.
+ *
+ * Basically, this lock becomes a barrier that enables us to lock out the fast
+ * path while we do things like enabling and disabling counters and
+ * synchronising the counters.
+ *
+ * Locking rules:
+ *
+ * 1. m_sb_lock before picking up per-cpu locks
+ * 2. per-cpu locks always picked up via for_each_online_cpu() order
+ * 3. accurate counter sync requires m_sb_lock + per cpu locks
+ * 4. modifying per-cpu counters requires holding per-cpu lock
+ * 5. modifying global counters requires holding m_sb_lock
+ * 6. enabling or disabling a counter requires holding the m_sb_lock
+ * and _none_ of the per-cpu locks.
+ *
+ * Disabled counters are only ever re-enabled by a balance operation
+ * that results in more free resources per CPU than a given threshold.
+ * To ensure counters don't remain disabled, they are rebalanced when
+ * the global resource goes above a higher threshold (i.e. some hysteresis
+ * is present to prevent thrashing).
+ */
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * hot-plug CPU notifier support.
+ *
+ * We need a notifier per filesystem as we need to be able to identify
+ * the filesystem to balance the counters out. This is achieved by
+ * having a notifier block embedded in the xfs_mount_t and doing pointer
+ * magic to get the mount pointer from the notifier block address.
+ */
+STATIC int
+xfs_icsb_cpu_notify(
+ struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ xfs_icsb_cnts_t *cntp;
+ xfs_mount_t *mp;
+
+ mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
+ cntp = (xfs_icsb_cnts_t *)
+ per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ /* Easy Case - initialize the area and locks, and
+ * then rebalance when online does everything else for us. */
+ memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ xfs_icsb_lock(mp);
+ xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+ xfs_icsb_unlock(mp);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ /* Disable all the counters, then fold the dead cpu's
+ * count into the total on the global superblock and
+ * re-enable the counters. */
+ xfs_icsb_lock(mp);
+ spin_lock(&mp->m_sb_lock);
+ xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
+ xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
+ xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
+
+ mp->m_sb.sb_icount += cntp->icsb_icount;
+ mp->m_sb.sb_ifree += cntp->icsb_ifree;
+ mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
+
+ memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
+
+ xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
+ xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
+ xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_icsb_unlock(mp);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int
+xfs_icsb_init_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
+ if (mp->m_sb_cnts == NULL)
+ return -ENOMEM;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
+ mp->m_icsb_notifier.priority = 0;
+ register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
+
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
+ }
+
+ mutex_init(&mp->m_icsb_mutex);
+
+ /*
+ * start with all counters disabled so that the
+ * initial balance kicks us off correctly
+ */
+ mp->m_icsb_counters = -1;
+ return 0;
+}
+
+void
+xfs_icsb_reinit_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_lock(mp);
+ /*
+ * start with all counters disabled so that the
+ * initial balance kicks us off correctly
+ */
+ mp->m_icsb_counters = -1;
+ xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
+ xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+ xfs_icsb_unlock(mp);
+}
+
+void
+xfs_icsb_destroy_counters(
+ xfs_mount_t *mp)
+{
+ if (mp->m_sb_cnts) {
+ unregister_hotcpu_notifier(&mp->m_icsb_notifier);
+ free_percpu(mp->m_sb_cnts);
+ }
+ mutex_destroy(&mp->m_icsb_mutex);
+}
+
+STATIC void
+xfs_icsb_lock_cntr(
+ xfs_icsb_cnts_t *icsbp)
+{
+ while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
+ ndelay(1000);
+ }
+}
+
+STATIC void
+xfs_icsb_unlock_cntr(
+ xfs_icsb_cnts_t *icsbp)
+{
+ clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
+}
+
+
+STATIC void
+xfs_icsb_lock_all_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ xfs_icsb_lock_cntr(cntp);
+ }
+}
+
+STATIC void
+xfs_icsb_unlock_all_counters(
+ xfs_mount_t *mp)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ xfs_icsb_unlock_cntr(cntp);
+ }
+}
+
+STATIC void
+xfs_icsb_count(
+ xfs_mount_t *mp,
+ xfs_icsb_cnts_t *cnt,
+ int flags)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
+
+ if (!(flags & XFS_ICSB_LAZY_COUNT))
+ xfs_icsb_lock_all_counters(mp);
+
+ for_each_online_cpu(i) {
+ cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+ cnt->icsb_icount += cntp->icsb_icount;
+ cnt->icsb_ifree += cntp->icsb_ifree;
+ cnt->icsb_fdblocks += cntp->icsb_fdblocks;
+ }
+
+ if (!(flags & XFS_ICSB_LAZY_COUNT))
+ xfs_icsb_unlock_all_counters(mp);
+}
+
+STATIC int
+xfs_icsb_counter_disabled(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field)
+{
+ ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+ return test_bit(field, &mp->m_icsb_counters);
+}
+
+STATIC void
+xfs_icsb_disable_counter(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field)
+{
+ xfs_icsb_cnts_t cnt;
+
+ ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+ /*
+ * If we are already disabled, then there is nothing to do
+ * here. We check before locking all the counters to avoid
+ * the expensive lock operation when being called in the
+ * slow path and the counter is already disabled. This is
+ * safe because the only time we set or clear this state is under
+ * the m_icsb_mutex.
+ */
+ if (xfs_icsb_counter_disabled(mp, field))
+ return;
+
+ xfs_icsb_lock_all_counters(mp);
+ if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
+ /* drain back to superblock */
+
+ xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
+ switch(field) {
+ case XFS_SBS_ICOUNT:
+ mp->m_sb.sb_icount = cnt.icsb_icount;
+ break;
+ case XFS_SBS_IFREE:
+ mp->m_sb.sb_ifree = cnt.icsb_ifree;
+ break;
+ case XFS_SBS_FDBLOCKS:
+ mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ xfs_icsb_unlock_all_counters(mp);
+}
+
+STATIC void
+xfs_icsb_enable_counter(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ uint64_t count,
+ uint64_t resid)
+{
+ xfs_icsb_cnts_t *cntp;
+ int i;
+
+ ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+ xfs_icsb_lock_all_counters(mp);
+ for_each_online_cpu(i) {
+ cntp = per_cpu_ptr(mp->m_sb_cnts, i);
+ switch (field) {
+ case XFS_SBS_ICOUNT:
+ cntp->icsb_icount = count + resid;
+ break;
+ case XFS_SBS_IFREE:
+ cntp->icsb_ifree = count + resid;
+ break;
+ case XFS_SBS_FDBLOCKS:
+ cntp->icsb_fdblocks = count + resid;
+ break;
+ default:
+ BUG();
+ break;
+ }
+ resid = 0;
+ }
+ clear_bit(field, &mp->m_icsb_counters);
+ xfs_icsb_unlock_all_counters(mp);
+}
+
+void
+xfs_icsb_sync_counters_locked(
+ xfs_mount_t *mp,
+ int flags)
+{
+ xfs_icsb_cnts_t cnt;
+
+ xfs_icsb_count(mp, &cnt, flags);
+
+ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
+ mp->m_sb.sb_icount = cnt.icsb_icount;
+ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
+ mp->m_sb.sb_ifree = cnt.icsb_ifree;
+ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
+ mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+}
+
+/*
+ * Accurate update of per-cpu counters to incore superblock
+ */
+void
+xfs_icsb_sync_counters(
+ xfs_mount_t *mp,
+ int flags)
+{
+ spin_lock(&mp->m_sb_lock);
+ xfs_icsb_sync_counters_locked(mp, flags);
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/*
+ * Balance and enable/disable counters as necessary.
+ *
+ * Thresholds for re-enabling counters are somewhat magic. inode counts are
+ * chosen to be the same number as single on disk allocation chunk per CPU, and
+ * free blocks is something far enough zero that we aren't going thrash when we
+ * get near ENOSPC. We also need to supply a minimum we require per cpu to
+ * prevent looping endlessly when xfs_alloc_space asks for more than will
+ * be distributed to a single CPU but each CPU has enough blocks to be
+ * reenabled.
+ *
+ * Note that we can be called when counters are already disabled.
+ * xfs_icsb_disable_counter() optimises the counter locking in this case to
+ * prevent locking every per-cpu counter needlessly.
+ */
+
+#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64
+#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
+ (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
+STATIC void
+xfs_icsb_balance_counter_locked(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ int min_per_cpu)
+{
+ uint64_t count, resid;
+ int weight = num_online_cpus();
+ uint64_t min = (uint64_t)min_per_cpu;
+
+ /* disable counter and sync counter */
+ xfs_icsb_disable_counter(mp, field);
+
+ /* update counters - first CPU gets residual*/
+ switch (field) {
+ case XFS_SBS_ICOUNT:
+ count = mp->m_sb.sb_icount;
+ resid = do_div(count, weight);
+ if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
+ return;
+ break;
+ case XFS_SBS_IFREE:
+ count = mp->m_sb.sb_ifree;
+ resid = do_div(count, weight);
+ if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
+ return;
+ break;
+ case XFS_SBS_FDBLOCKS:
+ count = mp->m_sb.sb_fdblocks;
+ resid = do_div(count, weight);
+ if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
+ return;
+ break;
+ default:
+ BUG();
+ count = resid = 0; /* quiet, gcc */
+ break;
+ }
+
+ xfs_icsb_enable_counter(mp, field, count, resid);
+}
+
+STATIC void
+xfs_icsb_balance_counter(
+ xfs_mount_t *mp,
+ xfs_sb_field_t fields,
+ int min_per_cpu)
+{
+ spin_lock(&mp->m_sb_lock);
+ xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
+ spin_unlock(&mp->m_sb_lock);
+}
+
+int
+xfs_icsb_modify_counters(
+ xfs_mount_t *mp,
+ xfs_sb_field_t field,
+ int64_t delta,
+ int rsvd)
+{
+ xfs_icsb_cnts_t *icsbp;
+ long long lcounter; /* long counter for 64 bit fields */
+ int ret = 0;
+
+ might_sleep();
+again:
+ preempt_disable();
+ icsbp = this_cpu_ptr(mp->m_sb_cnts);
+
+ /*
+ * if the counter is disabled, go to slow path
+ */
+ if (unlikely(xfs_icsb_counter_disabled(mp, field)))
+ goto slow_path;
+ xfs_icsb_lock_cntr(icsbp);
+ if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
+ xfs_icsb_unlock_cntr(icsbp);
+ goto slow_path;
+ }
+
+ switch (field) {
+ case XFS_SBS_ICOUNT:
+ lcounter = icsbp->icsb_icount;
+ lcounter += delta;
+ if (unlikely(lcounter < 0))
+ goto balance_counter;
+ icsbp->icsb_icount = lcounter;
+ break;
+
+ case XFS_SBS_IFREE:
+ lcounter = icsbp->icsb_ifree;
+ lcounter += delta;
+ if (unlikely(lcounter < 0))
+ goto balance_counter;
+ icsbp->icsb_ifree = lcounter;
+ break;
+
+ case XFS_SBS_FDBLOCKS:
+ BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
+
+ lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ lcounter += delta;
+ if (unlikely(lcounter < 0))
+ goto balance_counter;
+ icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ xfs_icsb_unlock_cntr(icsbp);
+ preempt_enable();
+ return 0;
+
+slow_path:
+ preempt_enable();
+
+ /*
+ * serialise with a mutex so we don't burn lots of cpu on
+ * the superblock lock. We still need to hold the superblock
+ * lock, however, when we modify the global structures.
+ */
+ xfs_icsb_lock(mp);
+
+ /*
+ * Now running atomically.
+ *
+ * If the counter is enabled, someone has beaten us to rebalancing.
+ * Drop the lock and try again in the fast path....
+ */
+ if (!(xfs_icsb_counter_disabled(mp, field))) {
+ xfs_icsb_unlock(mp);
+ goto again;
+ }
+
+ /*
+ * The counter is currently disabled. Because we are
+ * running atomically here, we know a rebalance cannot
+ * be in progress. Hence we can go straight to operating
+ * on the global superblock. We do not call xfs_mod_incore_sb()
+ * here even though we need to get the m_sb_lock. Doing so
+ * will cause us to re-enter this function and deadlock.
+ * Hence we get the m_sb_lock ourselves and then call
+ * xfs_mod_incore_sb_unlocked() as the unlocked path operates
+ * directly on the global counters.
+ */
+ spin_lock(&mp->m_sb_lock);
+ ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Now that we've modified the global superblock, we
+ * may be able to re-enable the distributed counters
+ * (e.g. lots of space just got freed). After that
+ * we are done.
+ */
+ if (ret != ENOSPC)
+ xfs_icsb_balance_counter(mp, field, 0);
+ xfs_icsb_unlock(mp);
+ return ret;
+
+balance_counter:
+ xfs_icsb_unlock_cntr(icsbp);
+ preempt_enable();
+
+ /*
+ * We may have multiple threads here if multiple per-cpu
+ * counters run dry at the same time. This will mean we can
+ * do more balances than strictly necessary but it is not
+ * the common slowpath case.
+ */
+ xfs_icsb_lock(mp);
+
+ /*
+ * running atomically.
+ *
+ * This will leave the counter in the correct state for future
+ * accesses. After the rebalance, we simply try again and our retry
+ * will either succeed through the fast path or slow path without
+ * another balance operation being required.
+ */
+ xfs_icsb_balance_counter(mp, field, delta);
+ xfs_icsb_unlock(mp);
+ goto again;
+}
+
+#endif
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
new file mode 100644
index 00000000..3d68bb26
--- /dev/null
+++ b/fs/xfs/xfs_mount.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_MOUNT_H__
+#define __XFS_MOUNT_H__
+
+typedef struct xfs_trans_reservations {
+ uint tr_write; /* extent alloc trans */
+ uint tr_itruncate; /* truncate trans */
+ uint tr_rename; /* rename trans */
+ uint tr_link; /* link trans */
+ uint tr_remove; /* unlink trans */
+ uint tr_symlink; /* symlink trans */
+ uint tr_create; /* create trans */
+ uint tr_mkdir; /* mkdir trans */
+ uint tr_ifree; /* inode free trans */
+ uint tr_ichange; /* inode update trans */
+ uint tr_growdata; /* fs data section grow trans */
+ uint tr_swrite; /* sync write inode trans */
+ uint tr_addafork; /* cvt inode to attributed trans */
+ uint tr_writeid; /* write setuid/setgid file */
+ uint tr_attrinval; /* attr fork buffer invalidation */
+ uint tr_attrset; /* set/create an attribute */
+ uint tr_attrrm; /* remove an attribute */
+ uint tr_clearagi; /* clear bad agi unlinked ino bucket */
+ uint tr_growrtalloc; /* grow realtime allocations */
+ uint tr_growrtzero; /* grow realtime zeroing */
+ uint tr_growrtfree; /* grow realtime freeing */
+} xfs_trans_reservations_t;
+
+#ifndef __KERNEL__
+
+#define xfs_daddr_to_agno(mp,d) \
+ ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
+#define xfs_daddr_to_agbno(mp,d) \
+ ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
+
+#else /* __KERNEL__ */
+
+#include "xfs_sync.h"
+
+struct log;
+struct xfs_mount_args;
+struct xfs_inode;
+struct xfs_bmbt_irec;
+struct xfs_bmap_free;
+struct xfs_extdelta;
+struct xfs_swapext;
+struct xfs_mru_cache;
+struct xfs_nameops;
+struct xfs_ail;
+struct xfs_quotainfo;
+
+#ifdef HAVE_PERCPU_SB
+
+/*
+ * Valid per-cpu incore superblock counters. Note that if you add new counters,
+ * you may need to define new counter disabled bit field descriptors as there
+ * are more possible fields in the superblock that can fit in a bitfield on a
+ * 32 bit platform. The XFS_SBS_* values for the current current counters just
+ * fit.
+ */
+typedef struct xfs_icsb_cnts {
+ uint64_t icsb_fdblocks;
+ uint64_t icsb_ifree;
+ uint64_t icsb_icount;
+ unsigned long icsb_flags;
+} xfs_icsb_cnts_t;
+
+#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */
+
+#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */
+
+extern int xfs_icsb_init_counters(struct xfs_mount *);
+extern void xfs_icsb_reinit_counters(struct xfs_mount *);
+extern void xfs_icsb_destroy_counters(struct xfs_mount *);
+extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
+extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
+extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
+ int64_t, int);
+
+#else
+#define xfs_icsb_init_counters(mp) (0)
+#define xfs_icsb_destroy_counters(mp) do { } while (0)
+#define xfs_icsb_reinit_counters(mp) do { } while (0)
+#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
+#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
+#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
+ xfs_mod_incore_sb(mp, field, delta, rsvd)
+#endif
+
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+ XFS_LOWSP_1_PCNT = 0,
+ XFS_LOWSP_2_PCNT,
+ XFS_LOWSP_3_PCNT,
+ XFS_LOWSP_4_PCNT,
+ XFS_LOWSP_5_PCNT,
+ XFS_LOWSP_MAX,
+};
+
+typedef struct xfs_mount {
+ struct super_block *m_super;
+ xfs_tid_t m_tid; /* next unused tid for fs */
+ struct xfs_ail *m_ail; /* fs active log item list */
+ xfs_sb_t m_sb; /* copy of fs superblock */
+ spinlock_t m_sb_lock; /* sb counter lock */
+ struct xfs_buf *m_sb_bp; /* buffer for superblock */
+ char *m_fsname; /* filesystem name */
+ int m_fsname_len; /* strlen of fs name */
+ char *m_rtname; /* realtime device name */
+ char *m_logname; /* external log device name */
+ int m_bsize; /* fs logical block size */
+ xfs_agnumber_t m_agfrotor; /* last ag where space found */
+ xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
+ spinlock_t m_agirotor_lock;/* .. and lock protecting it */
+ xfs_agnumber_t m_maxagi; /* highest inode alloc group */
+ uint m_readio_log; /* min read size log bytes */
+ uint m_readio_blocks; /* min read size blocks */
+ uint m_writeio_log; /* min write size log bytes */
+ uint m_writeio_blocks; /* min write size blocks */
+ struct log *m_log; /* log specific stuff */
+ int m_logbufs; /* number of log buffers */
+ int m_logbsize; /* size of each log buffer */
+ uint m_rsumlevels; /* rt summary levels */
+ uint m_rsumsize; /* size of rt summary, bytes */
+ struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
+ struct xfs_inode *m_rsumip; /* pointer to summary inode */
+ struct xfs_inode *m_rootip; /* pointer to root directory */
+ struct xfs_quotainfo *m_quotainfo; /* disk quota information */
+ xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
+ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
+ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ __uint8_t m_blkbit_log; /* blocklog + NBBY */
+ __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
+ __uint8_t m_agno_log; /* log #ag's */
+ __uint8_t m_agino_log; /* #bits for agino in inum */
+ __uint16_t m_inode_cluster_size;/* min inode buf size */
+ uint m_blockmask; /* sb_blocksize-1 */
+ uint m_blockwsize; /* sb_blocksize in words */
+ uint m_blockwmask; /* blockwsize-1 */
+ uint m_alloc_mxr[2]; /* max alloc btree records */
+ uint m_alloc_mnr[2]; /* min alloc btree records */
+ uint m_bmap_dmxr[2]; /* max bmap btree records */
+ uint m_bmap_dmnr[2]; /* min bmap btree records */
+ uint m_inobt_mxr[2]; /* max inobt btree records */
+ uint m_inobt_mnr[2]; /* min inobt btree records */
+ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
+ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+ uint m_in_maxlevels; /* max inobt btree levels. */
+ struct radix_tree_root m_perag_tree; /* per-ag accounting info */
+ spinlock_t m_perag_lock; /* lock for m_perag_tree */
+ struct mutex m_growlock; /* growfs mutex */
+ int m_fixedfsid[2]; /* unchanged for life of FS */
+ uint m_dmevmask; /* DMI events for this FS */
+ __uint64_t m_flags; /* global mount flags */
+ uint m_dir_node_ents; /* #entries in a dir danode */
+ uint m_attr_node_ents; /* #entries in attr danode */
+ int m_ialloc_inos; /* inodes in inode allocation */
+ int m_ialloc_blks; /* blocks in inode allocation */
+ int m_inoalign_mask;/* mask sb_inoalignmt if used */
+ uint m_qflags; /* quota status flags */
+ xfs_trans_reservations_t m_reservations;/* precomputed res values */
+ __uint64_t m_maxicount; /* maximum inode count */
+ __uint64_t m_maxioffset; /* maximum inode offset */
+ __uint64_t m_resblks; /* total reserved blocks */
+ __uint64_t m_resblks_avail;/* available reserved blocks */
+ __uint64_t m_resblks_save; /* reserved blks @ remount,ro */
+ int m_dalign; /* stripe unit */
+ int m_swidth; /* stripe width */
+ int m_sinoalign; /* stripe unit inode alignment */
+ int m_attr_magicpct;/* 37% of the blocksize */
+ int m_dir_magicpct; /* 37% of the dir blocksize */
+ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
+ int m_dirblksize; /* directory block sz--bytes */
+ int m_dirblkfsbs; /* directory block sz--fsbs */
+ xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */
+ xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
+ xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
+ uint m_chsize; /* size of next field */
+ struct xfs_chash *m_chash; /* fs private inode per-cluster
+ * hash table */
+ atomic_t m_active_trans; /* number trans frozen */
+#ifdef HAVE_PERCPU_SB
+ xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
+ unsigned long m_icsb_counters; /* disabled per-cpu counters */
+ struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
+ struct mutex m_icsb_mutex; /* balancer sync lock */
+#endif
+ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
+ struct delayed_work m_sync_work; /* background sync work */
+ struct delayed_work m_reclaim_work; /* background inode reclaim */
+ struct work_struct m_flush_work; /* background inode flush */
+ __int64_t m_update_flags; /* sb flags we need to update
+ on the next remount,rw */
+ struct shrinker m_inode_shrink; /* inode reclaim shrinker */
+ int64_t m_low_space[XFS_LOWSP_MAX];
+ /* low free space thresholds */
+} xfs_mount_t;
+
+/*
+ * Flags for m_flags.
+ */
+#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
+ must be synchronous except
+ for space allocations */
+#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
+#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
+#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
+ operations, typically for
+ disk errors in metadata */
+#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
+#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
+ user */
+#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
+ allocations */
+#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
+#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
+#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
+#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
+#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above
+ * 32 bits in size */
+#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */
+#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */
+#define XFS_MOUNT_BARRIER (1ULL << 17)
+#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/
+#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width
+ * allocation */
+#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */
+#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
+#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
+ * I/O size in stat() */
+#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
+ allocator */
+#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
+
+
+/*
+ * Default minimum read and write sizes.
+ */
+#define XFS_READIO_LOG_LARGE 16
+#define XFS_WRITEIO_LOG_LARGE 16
+
+/*
+ * Max and min values for mount-option defined I/O
+ * preallocation sizes.
+ */
+#define XFS_MAX_IO_LOG 30 /* 1G */
+#define XFS_MIN_IO_LOG PAGE_SHIFT
+
+/*
+ * Synchronous read and write sizes. This should be
+ * better for NFSv2 wsync filesystems.
+ */
+#define XFS_WSYNC_READIO_LOG 15 /* 32k */
+#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */
+
+/*
+ * Allow large block sizes to be reported to userspace programs if the
+ * "largeio" mount option is used.
+ *
+ * If compatibility mode is specified, simply return the basic unit of caching
+ * so that we don't get inefficient read/modify/write I/O from user apps.
+ * Otherwise....
+ *
+ * If the underlying volume is a stripe, then return the stripe width in bytes
+ * as the recommended I/O size. It is not a stripe and we've set a default
+ * buffered I/O size, return that, otherwise return the compat default.
+ */
+static inline unsigned long
+xfs_preferred_iosize(xfs_mount_t *mp)
+{
+ if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
+ return PAGE_CACHE_SIZE;
+ return (mp->m_swidth ?
+ (mp->m_swidth << mp->m_sb.sb_blocklog) :
+ ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
+ (1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
+ PAGE_CACHE_SIZE));
+}
+
+#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset)
+
+#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
+ ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
+#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
+void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+ int lnnum);
+#define xfs_force_shutdown(m,f) \
+ xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
+
+#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
+
+#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
+#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
+
+/*
+ * Flags for xfs_mountfs
+ */
+#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */
+
+static inline xfs_agnumber_t
+xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+ xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ do_div(ld, mp->m_sb.sb_agblocks);
+ return (xfs_agnumber_t) ld;
+}
+
+static inline xfs_agblock_t
+xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+ xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * perag get/put wrappers for ref counting
+ */
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int tag);
+void xfs_perag_put(struct xfs_perag *pag);
+
+/*
+ * Per-cpu superblock locking functions
+ */
+#ifdef HAVE_PERCPU_SB
+static inline void
+xfs_icsb_lock(xfs_mount_t *mp)
+{
+ mutex_lock(&mp->m_icsb_mutex);
+}
+
+static inline void
+xfs_icsb_unlock(xfs_mount_t *mp)
+{
+ mutex_unlock(&mp->m_icsb_mutex);
+}
+#else
+#define xfs_icsb_lock(mp)
+#define xfs_icsb_unlock(mp)
+#endif
+
+/*
+ * This structure is for use by the xfs_mod_incore_sb_batch() routine.
+ * xfs_growfs can specify a few fields which are more than int limit
+ */
+typedef struct xfs_mod_sb {
+ xfs_sb_field_t msb_field; /* Field to modify, see below */
+ int64_t msb_delta; /* Change to make to specified field */
+} xfs_mod_sb_t;
+
+extern int xfs_log_sbcount(xfs_mount_t *, uint);
+extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
+extern int xfs_mountfs(xfs_mount_t *mp);
+
+extern void xfs_unmountfs(xfs_mount_t *);
+extern int xfs_unmountfs_writesb(xfs_mount_t *);
+extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
+extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
+ uint, int);
+extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
+extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
+extern int xfs_readsb(xfs_mount_t *, int);
+extern void xfs_freesb(xfs_mount_t *);
+extern int xfs_fs_writable(xfs_mount_t *);
+extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
+
+extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
+
+extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+
+#endif /* __KERNEL__ */
+
+extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
+ xfs_agnumber_t *);
+extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+
+#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
new file mode 100644
index 00000000..4aff5639
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_mru_cache.h"
+
+/*
+ * The MRU Cache data structure consists of a data store, an array of lists and
+ * a lock to protect its internal state. At initialisation time, the client
+ * supplies an element lifetime in milliseconds and a group count, as well as a
+ * function pointer to call when deleting elements. A data structure for
+ * queueing up work in the form of timed callbacks is also included.
+ *
+ * The group count controls how many lists are created, and thereby how finely
+ * the elements are grouped in time. When reaping occurs, all the elements in
+ * all the lists whose time has expired are deleted.
+ *
+ * To give an example of how this works in practice, consider a client that
+ * initialises an MRU Cache with a lifetime of ten seconds and a group count of
+ * five. Five internal lists will be created, each representing a two second
+ * period in time. When the first element is added, time zero for the data
+ * structure is initialised to the current time.
+ *
+ * All the elements added in the first two seconds are appended to the first
+ * list. Elements added in the third second go into the second list, and so on.
+ * If an element is accessed at any point, it is removed from its list and
+ * inserted at the head of the current most-recently-used list.
+ *
+ * The reaper function will have nothing to do until at least twelve seconds
+ * have elapsed since the first element was added. The reason for this is that
+ * if it were called at t=11s, there could be elements in the first list that
+ * have only been inactive for nine seconds, so it still does nothing. If it is
+ * called anywhere between t=12 and t=14 seconds, it will delete all the
+ * elements that remain in the first list. It's therefore possible for elements
+ * to remain in the data store even after they've been inactive for up to
+ * (t + t/g) seconds, where t is the inactive element lifetime and g is the
+ * number of groups.
+ *
+ * The above example assumes that the reaper function gets called at least once
+ * every (t/g) seconds. If it is called less frequently, unused elements will
+ * accumulate in the reap list until the reaper function is eventually called.
+ * The current implementation uses work queue callbacks to carefully time the
+ * reaper function calls, so this should happen rarely, if at all.
+ *
+ * From a design perspective, the primary reason for the choice of a list array
+ * representing discrete time intervals is that it's only practical to reap
+ * expired elements in groups of some appreciable size. This automatically
+ * introduces a granularity to element lifetimes, so there's no point storing an
+ * individual timeout with each element that specifies a more precise reap time.
+ * The bonus is a saving of sizeof(long) bytes of memory per element stored.
+ *
+ * The elements could have been stored in just one list, but an array of
+ * counters or pointers would need to be maintained to allow them to be divided
+ * up into discrete time groups. More critically, the process of touching or
+ * removing an element would involve walking large portions of the entire list,
+ * which would have a detrimental effect on performance. The additional memory
+ * requirement for the array of list heads is minimal.
+ *
+ * When an element is touched or deleted, it needs to be removed from its
+ * current list. Doubly linked lists are used to make the list maintenance
+ * portion of these operations O(1). Since reaper timing can be imprecise,
+ * inserts and lookups can occur when there are no free lists available. When
+ * this happens, all the elements on the LRU list need to be migrated to the end
+ * of the reap list. To keep the list maintenance portion of these operations
+ * O(1) also, list tails need to be accessible without walking the entire list.
+ * This is the reason why doubly linked list heads are used.
+ */
+
+/*
+ * An MRU Cache is a dynamic data structure that stores its elements in a way
+ * that allows efficient lookups, but also groups them into discrete time
+ * intervals based on insertion time. This allows elements to be efficiently
+ * and automatically reaped after a fixed period of inactivity.
+ *
+ * When a client data pointer is stored in the MRU Cache it needs to be added to
+ * both the data store and to one of the lists. It must also be possible to
+ * access each of these entries via the other, i.e. to:
+ *
+ * a) Walk a list, removing the corresponding data store entry for each item.
+ * b) Look up a data store entry, then access its list entry directly.
+ *
+ * To achieve both of these goals, each entry must contain both a list entry and
+ * a key, in addition to the user's data pointer. Note that it's not a good
+ * idea to have the client embed one of these structures at the top of their own
+ * data structure, because inserting the same item more than once would most
+ * likely result in a loop in one of the lists. That's a sure-fire recipe for
+ * an infinite loop in the code.
+ */
+typedef struct xfs_mru_cache_elem
+{
+ struct list_head list_node;
+ unsigned long key;
+ void *value;
+} xfs_mru_cache_elem_t;
+
+static kmem_zone_t *xfs_mru_elem_zone;
+static struct workqueue_struct *xfs_mru_reap_wq;
+
+/*
+ * When inserting, destroying or reaping, it's first necessary to update the
+ * lists relative to a particular time. In the case of destroying, that time
+ * will be well in the future to ensure that all items are moved to the reap
+ * list. In all other cases though, the time will be the current time.
+ *
+ * This function enters a loop, moving the contents of the LRU list to the reap
+ * list again and again until either a) the lists are all empty, or b) time zero
+ * has been advanced sufficiently to be within the immediate element lifetime.
+ *
+ * Case a) above is detected by counting how many groups are migrated and
+ * stopping when they've all been moved. Case b) is detected by monitoring the
+ * time_zero field, which is updated as each group is migrated.
+ *
+ * The return value is the earliest time that more migration could be needed, or
+ * zero if there's no need to schedule more work because the lists are empty.
+ */
+STATIC unsigned long
+_xfs_mru_cache_migrate(
+ xfs_mru_cache_t *mru,
+ unsigned long now)
+{
+ unsigned int grp;
+ unsigned int migrated = 0;
+ struct list_head *lru_list;
+
+ /* Nothing to do if the data store is empty. */
+ if (!mru->time_zero)
+ return 0;
+
+ /* While time zero is older than the time spanned by all the lists. */
+ while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {
+
+ /*
+ * If the LRU list isn't empty, migrate its elements to the tail
+ * of the reap list.
+ */
+ lru_list = mru->lists + mru->lru_grp;
+ if (!list_empty(lru_list))
+ list_splice_init(lru_list, mru->reap_list.prev);
+
+ /*
+ * Advance the LRU group number, freeing the old LRU list to
+ * become the new MRU list; advance time zero accordingly.
+ */
+ mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
+ mru->time_zero += mru->grp_time;
+
+ /*
+ * If reaping is so far behind that all the elements on all the
+ * lists have been migrated to the reap list, it's now empty.
+ */
+ if (++migrated == mru->grp_count) {
+ mru->lru_grp = 0;
+ mru->time_zero = 0;
+ return 0;
+ }
+ }
+
+ /* Find the first non-empty list from the LRU end. */
+ for (grp = 0; grp < mru->grp_count; grp++) {
+
+ /* Check the grp'th list from the LRU end. */
+ lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
+ if (!list_empty(lru_list))
+ return mru->time_zero +
+ (mru->grp_count + grp) * mru->grp_time;
+ }
+
+ /* All the lists must be empty. */
+ mru->lru_grp = 0;
+ mru->time_zero = 0;
+ return 0;
+}
+
+/*
+ * When inserting or doing a lookup, an element needs to be inserted into the
+ * MRU list. The lists must be migrated first to ensure that they're
+ * up-to-date, otherwise the new element could be given a shorter lifetime in
+ * the cache than it should.
+ */
+STATIC void
+_xfs_mru_cache_list_insert(
+ xfs_mru_cache_t *mru,
+ xfs_mru_cache_elem_t *elem)
+{
+ unsigned int grp = 0;
+ unsigned long now = jiffies;
+
+ /*
+ * If the data store is empty, initialise time zero, leave grp set to
+ * zero and start the work queue timer if necessary. Otherwise, set grp
+ * to the number of group times that have elapsed since time zero.
+ */
+ if (!_xfs_mru_cache_migrate(mru, now)) {
+ mru->time_zero = now;
+ if (!mru->queued) {
+ mru->queued = 1;
+ queue_delayed_work(xfs_mru_reap_wq, &mru->work,
+ mru->grp_count * mru->grp_time);
+ }
+ } else {
+ grp = (now - mru->time_zero) / mru->grp_time;
+ grp = (mru->lru_grp + grp) % mru->grp_count;
+ }
+
+ /* Insert the element at the tail of the corresponding list. */
+ list_add_tail(&elem->list_node, mru->lists + grp);
+}
+
+/*
+ * When destroying or reaping, all the elements that were migrated to the reap
+ * list need to be deleted. For each element this involves removing it from the
+ * data store, removing it from the reap list, calling the client's free
+ * function and deleting the element from the element zone.
+ *
+ * We get called holding the mru->lock, which we drop and then reacquire.
+ * Sparse need special help with this to tell it we know what we are doing.
+ */
+STATIC void
+_xfs_mru_cache_clear_reap_list(
+ xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock)
+
+{
+ xfs_mru_cache_elem_t *elem, *next;
+ struct list_head tmp;
+
+ INIT_LIST_HEAD(&tmp);
+ list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
+
+ /* Remove the element from the data store. */
+ radix_tree_delete(&mru->store, elem->key);
+
+ /*
+ * remove to temp list so it can be freed without
+ * needing to hold the lock
+ */
+ list_move(&elem->list_node, &tmp);
+ }
+ spin_unlock(&mru->lock);
+
+ list_for_each_entry_safe(elem, next, &tmp, list_node) {
+
+ /* Remove the element from the reap list. */
+ list_del_init(&elem->list_node);
+
+ /* Call the client's free function with the key and value pointer. */
+ mru->free_func(elem->key, elem->value);
+
+ /* Free the element structure. */
+ kmem_zone_free(xfs_mru_elem_zone, elem);
+ }
+
+ spin_lock(&mru->lock);
+}
+
+/*
+ * We fire the reap timer every group expiry interval so
+ * we always have a reaper ready to run. This makes shutdown
+ * and flushing of the reaper easy to do. Hence we need to
+ * keep when the next reap must occur so we can determine
+ * at each interval whether there is anything we need to do.
+ */
+STATIC void
+_xfs_mru_cache_reap(
+ struct work_struct *work)
+{
+ xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work);
+ unsigned long now, next;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return;
+
+ spin_lock(&mru->lock);
+ next = _xfs_mru_cache_migrate(mru, jiffies);
+ _xfs_mru_cache_clear_reap_list(mru);
+
+ mru->queued = next;
+ if ((mru->queued > 0)) {
+ now = jiffies;
+ if (next <= now)
+ next = 0;
+ else
+ next -= now;
+ queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
+ }
+
+ spin_unlock(&mru->lock);
+}
+
+int
+xfs_mru_cache_init(void)
+{
+ xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
+ "xfs_mru_cache_elem");
+ if (!xfs_mru_elem_zone)
+ goto out;
+
+ xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
+ if (!xfs_mru_reap_wq)
+ goto out_destroy_mru_elem_zone;
+
+ return 0;
+
+ out_destroy_mru_elem_zone:
+ kmem_zone_destroy(xfs_mru_elem_zone);
+ out:
+ return -ENOMEM;
+}
+
+void
+xfs_mru_cache_uninit(void)
+{
+ destroy_workqueue(xfs_mru_reap_wq);
+ kmem_zone_destroy(xfs_mru_elem_zone);
+}
+
+/*
+ * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create()
+ * with the address of the pointer, a lifetime value in milliseconds, a group
+ * count and a free function to use when deleting elements. This function
+ * returns 0 if the initialisation was successful.
+ */
+int
+xfs_mru_cache_create(
+ xfs_mru_cache_t **mrup,
+ unsigned int lifetime_ms,
+ unsigned int grp_count,
+ xfs_mru_cache_free_func_t free_func)
+{
+ xfs_mru_cache_t *mru = NULL;
+ int err = 0, grp;
+ unsigned int grp_time;
+
+ if (mrup)
+ *mrup = NULL;
+
+ if (!mrup || !grp_count || !lifetime_ms || !free_func)
+ return EINVAL;
+
+ if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
+ return EINVAL;
+
+ if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
+ return ENOMEM;
+
+ /* An extra list is needed to avoid reaping up to a grp_time early. */
+ mru->grp_count = grp_count + 1;
+ mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+
+ if (!mru->lists) {
+ err = ENOMEM;
+ goto exit;
+ }
+
+ for (grp = 0; grp < mru->grp_count; grp++)
+ INIT_LIST_HEAD(mru->lists + grp);
+
+ /*
+ * We use GFP_KERNEL radix tree preload and do inserts under a
+ * spinlock so GFP_ATOMIC is appropriate for the radix tree itself.
+ */
+ INIT_RADIX_TREE(&mru->store, GFP_ATOMIC);
+ INIT_LIST_HEAD(&mru->reap_list);
+ spin_lock_init(&mru->lock);
+ INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap);
+
+ mru->grp_time = grp_time;
+ mru->free_func = free_func;
+
+ *mrup = mru;
+
+exit:
+ if (err && mru && mru->lists)
+ kmem_free(mru->lists);
+ if (err && mru)
+ kmem_free(mru);
+
+ return err;
+}
+
+/*
+ * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
+ * free functions as they're deleted. When this function returns, the caller is
+ * guaranteed that all the free functions for all the elements have finished
+ * executing and the reaper is not running.
+ */
+static void
+xfs_mru_cache_flush(
+ xfs_mru_cache_t *mru)
+{
+ if (!mru || !mru->lists)
+ return;
+
+ spin_lock(&mru->lock);
+ if (mru->queued) {
+ spin_unlock(&mru->lock);
+ cancel_delayed_work_sync(&mru->work);
+ spin_lock(&mru->lock);
+ }
+
+ _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
+ _xfs_mru_cache_clear_reap_list(mru);
+
+ spin_unlock(&mru->lock);
+}
+
+void
+xfs_mru_cache_destroy(
+ xfs_mru_cache_t *mru)
+{
+ if (!mru || !mru->lists)
+ return;
+
+ xfs_mru_cache_flush(mru);
+
+ kmem_free(mru->lists);
+ kmem_free(mru);
+}
+
+/*
+ * To insert an element, call xfs_mru_cache_insert() with the data store, the
+ * element's key and the client data pointer. This function returns 0 on
+ * success or ENOMEM if memory for the data element couldn't be allocated.
+ */
+int
+xfs_mru_cache_insert(
+ xfs_mru_cache_t *mru,
+ unsigned long key,
+ void *value)
+{
+ xfs_mru_cache_elem_t *elem;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return EINVAL;
+
+ elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP);
+ if (!elem)
+ return ENOMEM;
+
+ if (radix_tree_preload(GFP_KERNEL)) {
+ kmem_zone_free(xfs_mru_elem_zone, elem);
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&elem->list_node);
+ elem->key = key;
+ elem->value = value;
+
+ spin_lock(&mru->lock);
+
+ radix_tree_insert(&mru->store, key, elem);
+ radix_tree_preload_end();
+ _xfs_mru_cache_list_insert(mru, elem);
+
+ spin_unlock(&mru->lock);
+
+ return 0;
+}
+
+/*
+ * To remove an element without calling the free function, call
+ * xfs_mru_cache_remove() with the data store and the element's key. On success
+ * the client data pointer for the removed element is returned, otherwise this
+ * function will return a NULL pointer.
+ */
+void *
+xfs_mru_cache_remove(
+ xfs_mru_cache_t *mru,
+ unsigned long key)
+{
+ xfs_mru_cache_elem_t *elem;
+ void *value = NULL;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return NULL;
+
+ spin_lock(&mru->lock);
+ elem = radix_tree_delete(&mru->store, key);
+ if (elem) {
+ value = elem->value;
+ list_del(&elem->list_node);
+ }
+
+ spin_unlock(&mru->lock);
+
+ if (elem)
+ kmem_zone_free(xfs_mru_elem_zone, elem);
+
+ return value;
+}
+
+/*
+ * To remove and element and call the free function, call xfs_mru_cache_delete()
+ * with the data store and the element's key.
+ */
+void
+xfs_mru_cache_delete(
+ xfs_mru_cache_t *mru,
+ unsigned long key)
+{
+ void *value = xfs_mru_cache_remove(mru, key);
+
+ if (value)
+ mru->free_func(key, value);
+}
+
+/*
+ * To look up an element using its key, call xfs_mru_cache_lookup() with the
+ * data store and the element's key. If found, the element will be moved to the
+ * head of the MRU list to indicate that it's been touched.
+ *
+ * The internal data structures are protected by a spinlock that is STILL HELD
+ * when this function returns. Call xfs_mru_cache_done() to release it. Note
+ * that it is not safe to call any function that might sleep in the interim.
+ *
+ * The implementation could have used reference counting to avoid this
+ * restriction, but since most clients simply want to get, set or test a member
+ * of the returned data structure, the extra per-element memory isn't warranted.
+ *
+ * If the element isn't found, this function returns NULL and the spinlock is
+ * released. xfs_mru_cache_done() should NOT be called when this occurs.
+ *
+ * Because sparse isn't smart enough to know about conditional lock return
+ * status, we need to help it get it right by annotating the path that does
+ * not release the lock.
+ */
+void *
+xfs_mru_cache_lookup(
+ xfs_mru_cache_t *mru,
+ unsigned long key)
+{
+ xfs_mru_cache_elem_t *elem;
+
+ ASSERT(mru && mru->lists);
+ if (!mru || !mru->lists)
+ return NULL;
+
+ spin_lock(&mru->lock);
+ elem = radix_tree_lookup(&mru->store, key);
+ if (elem) {
+ list_del(&elem->list_node);
+ _xfs_mru_cache_list_insert(mru, elem);
+ __release(mru_lock); /* help sparse not be stupid */
+ } else
+ spin_unlock(&mru->lock);
+
+ return elem ? elem->value : NULL;
+}
+
+/*
+ * To release the internal data structure spinlock after having performed an
+ * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
+ * with the data store pointer.
+ */
+void
+xfs_mru_cache_done(
+ xfs_mru_cache_t *mru) __releases(mru->lock)
+{
+ spin_unlock(&mru->lock);
+}
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
new file mode 100644
index 00000000..36dd3ec8
--- /dev/null
+++ b/fs/xfs/xfs_mru_cache.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_MRU_CACHE_H__
+#define __XFS_MRU_CACHE_H__
+
+
+/* Function pointer type for callback to free a client's data pointer. */
+typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);
+
+typedef struct xfs_mru_cache
+{
+ struct radix_tree_root store; /* Core storage data structure. */
+ struct list_head *lists; /* Array of lists, one per grp. */
+ struct list_head reap_list; /* Elements overdue for reaping. */
+ spinlock_t lock; /* Lock to protect this struct. */
+ unsigned int grp_count; /* Number of discrete groups. */
+ unsigned int grp_time; /* Time period spanned by grps. */
+ unsigned int lru_grp; /* Group containing time zero. */
+ unsigned long time_zero; /* Time first element was added. */
+ xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
+ struct delayed_work work; /* Workqueue data for reaping. */
+ unsigned int queued; /* work has been queued */
+} xfs_mru_cache_t;
+
+int xfs_mru_cache_init(void);
+void xfs_mru_cache_uninit(void);
+int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
+ unsigned int grp_count,
+ xfs_mru_cache_free_func_t free_func);
+void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
+int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
+ void *value);
+void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
+void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
+void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
+void xfs_mru_cache_done(struct xfs_mru_cache *mru);
+
+#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
new file mode 100644
index 00000000..a595f295
--- /dev/null
+++ b/fs/xfs/xfs_quota.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_QUOTA_H__
+#define __XFS_QUOTA_H__
+
+struct xfs_trans;
+
+/*
+ * The ondisk form of a dquot structure.
+ */
+#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
+#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
+
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef __uint32_t xfs_dqid_t;
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t xfs_qcnt_t;
+typedef __uint16_t xfs_qwarncnt_t;
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct xfs_disk_dquot {
+ __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
+ __u8 d_version; /* dquot version */
+ __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
+ __be32 d_id; /* user,project,group id */
+ __be64 d_blk_hardlimit;/* absolute limit on disk blks */
+ __be64 d_blk_softlimit;/* preferred limit on disk blks */
+ __be64 d_ino_hardlimit;/* maximum # allocated inodes */
+ __be64 d_ino_softlimit;/* preferred inode limit */
+ __be64 d_bcount; /* disk blocks owned by the user */
+ __be64 d_icount; /* inodes owned by the user */
+ __be32 d_itimer; /* zero if within inode limits if not,
+ this is when we refuse service */
+ __be32 d_btimer; /* similar to above; for disk blocks */
+ __be16 d_iwarns; /* warnings issued wrt num inodes */
+ __be16 d_bwarns; /* warnings issued wrt disk blocks */
+ __be32 d_pad0; /* 64 bit align */
+ __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
+ __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
+ __be64 d_rtbcount; /* realtime blocks owned */
+ __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
+ __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
+ __be16 d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+ xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
+ char dd_fill[32]; /* filling for posterity */
+} xfs_dqblk_t;
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER 0x0001 /* a user quota */
+#define XFS_DQ_PROJ 0x0002 /* project quota */
+#define XFS_DQ_GROUP 0x0004 /* a group quota */
+#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
+#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
+#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
+
+#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+
+#define XFS_DQ_FLAGS \
+ { XFS_DQ_USER, "USER" }, \
+ { XFS_DQ_PROJ, "PROJ" }, \
+ { XFS_DQ_GROUP, "GROUP" }, \
+ { XFS_DQ_DIRTY, "DIRTY" }, \
+ { XFS_DQ_WANT, "WANT" }, \
+ { XFS_DQ_INACTIVE, "INACTIVE" }
+
+/*
+ * In the worst case, when both user and group quotas are on,
+ * we can have a max of three dquots changing in a single transaction.
+ */
+#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3)
+
+
+/*
+ * These are the structures used to lay out dquots and quotaoff
+ * records on the log. Quite similar to those of inodes.
+ */
+
+/*
+ * log format struct for dquots.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+ __uint16_t qlf_type; /* dquot log item type */
+ __uint16_t qlf_size; /* size of this item */
+ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
+ __int64_t qlf_blkno; /* blkno of dquot buffer */
+ __int32_t qlf_len; /* len of dquot buffer */
+ __uint32_t qlf_boffset; /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+ unsigned short qf_type; /* quotaoff log item type */
+ unsigned short qf_size; /* size of this item */
+ unsigned int qf_flags; /* USR and/or GRP */
+ char qf_pad[12]; /* padding for future */
+} xfs_qoff_logformat_t;
+
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
+
+/*
+ * Quota Accounting/Enforcement flags
+ */
+#define XFS_ALL_QUOTA_ACCT \
+ (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+
+#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD)
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will be not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+ XFS_GQUOTA_ACTIVE | \
+ XFS_PQUOTA_ACTIVE))
+#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
+ XFS_PQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
+#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
+#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
+#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
+#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
+#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
+#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS 0x0010000
+#define XFS_QMOPT_RES_RTBLKS 0x0020000
+#define XFS_QMOPT_BCOUNT 0x0040000
+#define XFS_QMOPT_ICOUNT 0x0080000
+#define XFS_QMOPT_RTBCOUNT 0x0100000
+#define XFS_QMOPT_DELBCOUNT 0x0200000
+#define XFS_QMOPT_DELRTBCOUNT 0x0400000
+#define XFS_QMOPT_RES_INOS 0x0800000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT 0x1000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL \
+ (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+#ifdef __KERNEL__
+/*
+ * This check is done typically without holding the inode lock;
+ * that may seem racy, but it is harmless in the context that it is used.
+ * The inode cannot go inactive as long a reference is kept, and
+ * therefore if dquot(s) were attached, they'll stay consistent.
+ * If, for example, the ownership of the inode changes while
+ * we didn't have the inode locked, the appropriate dquot(s) will be
+ * attached atomically.
+ */
+#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
+ (ip)->i_udquot == NULL) || \
+ (XFS_IS_OQUOTA_ON(mp) && \
+ (ip)->i_gdquot == NULL))
+
+#define XFS_QM_NEED_QUOTACHECK(mp) \
+ ((XFS_IS_UQUOTA_ON(mp) && \
+ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
+ (XFS_IS_GQUOTA_ON(mp) && \
+ ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
+ (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
+ (XFS_IS_PQUOTA_ON(mp) && \
+ ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
+ (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
+
+#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+ XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
+ XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+ XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+ XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
+ XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
+ XFS_GQUOTA_ACCT)
+
+
+/*
+ * The structure kept inside the xfs_trans_t keep track of dquot changes
+ * within a transaction and apply them later.
+ */
+typedef struct xfs_dqtrx {
+ struct xfs_dquot *qt_dquot; /* the dquot this refers to */
+ ulong qt_blk_res; /* blks reserved on a dquot */
+ ulong qt_blk_res_used; /* blks used from the reservation */
+ ulong qt_ino_res; /* inode reserved on a dquot */
+ ulong qt_ino_res_used; /* inodes used from the reservation */
+ long qt_bcount_delta; /* dquot blk count changes */
+ long qt_delbcnt_delta; /* delayed dquot blk count changes */
+ long qt_icount_delta; /* dquot inode count changes */
+ ulong qt_rtblk_res; /* # blks reserved on a dquot */
+ ulong qt_rtblk_res_used;/* # blks used from reservation */
+ long qt_rtbcount_delta;/* dquot realtime blk changes */
+ long qt_delrtb_delta; /* delayed RT blk count changes */
+} xfs_dqtrx_t;
+
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
+extern void xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
+ uint, long);
+extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
+ struct xfs_inode *, long, long, uint);
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+ struct xfs_mount *, struct xfs_dquot *,
+ struct xfs_dquot *, long, long, uint);
+
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
+ struct xfs_dquot **, struct xfs_dquot **);
+extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
+ struct xfs_dquot *, struct xfs_dquot *);
+extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
+ struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
+extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
+ struct xfs_dquot *, struct xfs_dquot *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern void xfs_qm_dqdetach(struct xfs_inode *);
+extern void xfs_qm_dqrele(struct xfs_dquot *);
+extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
+extern int xfs_qm_sync(struct xfs_mount *, int);
+extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+extern void xfs_qm_mount_quotas(struct xfs_mount *);
+extern void xfs_qm_unmount(struct xfs_mount *);
+extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+
+#else
+static inline int
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
+ uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+{
+ *udqp = NULL;
+ *gdqp = NULL;
+ return 0;
+}
+#define xfs_trans_dup_dqinfo(tp, tp2)
+#define xfs_trans_free_dqinfo(tp)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_apply_dquot_deltas(tp)
+#define xfs_trans_unreserve_and_mod_dquots(tp)
+static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
+ struct xfs_inode *ip, long nblks, long ninos, uint flags)
+{
+ return 0;
+}
+static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
+ struct xfs_mount *mp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, long nblks, long nions, uint flags)
+{
+ return 0;
+}
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
+#define xfs_qm_vop_rename_dqattach(it) (0)
+#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0)
+#define xfs_qm_dqattach(ip, fl) (0)
+#define xfs_qm_dqattach_locked(ip, fl) (0)
+#define xfs_qm_dqdetach(ip)
+#define xfs_qm_dqrele(d)
+#define xfs_qm_statvfs(ip, s)
+static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
+{
+ return 0;
+}
+#define xfs_qm_newmount(mp, a, b) (0)
+#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_unmount(mp)
+#define xfs_qm_unmount_quotas(mp)
+#endif /* CONFIG_XFS_QUOTA */
+
+#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
+ xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
+ xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
+ f | XFS_QMOPT_RES_REGBLKS)
+
+extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
+ xfs_dqid_t, uint, uint, char *);
+extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
+
+#endif /* __KERNEL__ */
+#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
new file mode 100644
index 00000000..77a59891
--- /dev/null
+++ b/fs/xfs/xfs_rename.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_utils.h"
+#include "xfs_trans_space.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+
+/*
+ * Enter all inodes for a rename transaction into a sorted array.
+ */
+STATIC void
+xfs_sort_for_rename(
+ xfs_inode_t *dp1, /* in: old (source) directory inode */
+ xfs_inode_t *dp2, /* in: new (target) directory inode */
+ xfs_inode_t *ip1, /* in: inode of old entry */
+ xfs_inode_t *ip2, /* in: inode of new entry, if it
+ already exists, NULL otherwise. */
+ xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
+ int *num_inodes) /* out: number of inodes in array */
+{
+ xfs_inode_t *temp;
+ int i, j;
+
+ /*
+ * i_tab contains a list of pointers to inodes. We initialize
+ * the table here & we'll sort it. We will then use it to
+ * order the acquisition of the inode locks.
+ *
+ * Note that the table may contain duplicates. e.g., dp1 == dp2.
+ */
+ i_tab[0] = dp1;
+ i_tab[1] = dp2;
+ i_tab[2] = ip1;
+ if (ip2) {
+ *num_inodes = 4;
+ i_tab[3] = ip2;
+ } else {
+ *num_inodes = 3;
+ i_tab[3] = NULL;
+ }
+
+ /*
+ * Sort the elements via bubble sort. (Remember, there are at
+ * most 4 elements to sort, so this is adequate.)
+ */
+ for (i = 0; i < *num_inodes; i++) {
+ for (j = 1; j < *num_inodes; j++) {
+ if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+ temp = i_tab[j];
+ i_tab[j] = i_tab[j-1];
+ i_tab[j-1] = temp;
+ }
+ }
+ }
+}
+
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+ xfs_inode_t *src_dp,
+ struct xfs_name *src_name,
+ xfs_inode_t *src_ip,
+ xfs_inode_t *target_dp,
+ struct xfs_name *target_name,
+ xfs_inode_t *target_ip)
+{
+ xfs_trans_t *tp = NULL;
+ xfs_mount_t *mp = src_dp->i_mount;
+ int new_parent; /* moving to a new dir */
+ int src_is_directory; /* src_name is a directory */
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ xfs_inode_t *inodes[4];
+ int spaceres;
+ int num_inodes;
+
+ trace_xfs_rename(src_dp, target_dp, src_name, target_name);
+
+ new_parent = (src_dp != target_dp);
+ src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
+
+ if (src_is_directory) {
+ /*
+ * Check for link count overflow on target_dp
+ */
+ if (target_ip == NULL && new_parent &&
+ target_dp->i_d.di_nlink >= XFS_MAXLINK) {
+ error = XFS_ERROR(EMLINK);
+ goto std_return;
+ }
+ }
+
+ xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
+ inodes, &num_inodes);
+
+ xfs_bmap_init(&free_list, &first_block);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
+ if (error == ENOSPC) {
+ spaceres = 0;
+ error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
+ }
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto std_return;
+ }
+
+ /*
+ * Attach the dquots to the inodes
+ */
+ error = xfs_qm_vop_rename_dqattach(inodes);
+ if (error) {
+ xfs_trans_cancel(tp, cancel_flags);
+ goto std_return;
+ }
+
+ /*
+ * Lock all the participating inodes. Depending upon whether
+ * the target_name exists in the target directory, and
+ * whether the target directory is the same as the source
+ * directory, we can lock from 2 to 4 inodes.
+ */
+ xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
+
+ /*
+ * Join all the inodes to the transaction. From this point on,
+ * we can rely on either trans_commit or trans_cancel to unlock
+ * them.
+ */
+ xfs_trans_ijoin_ref(tp, src_dp, XFS_ILOCK_EXCL);
+ if (new_parent)
+ xfs_trans_ijoin_ref(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, src_ip, XFS_ILOCK_EXCL);
+ if (target_ip)
+ xfs_trans_ijoin_ref(tp, target_ip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we are using project inheritance, we only allow renames
+ * into our tree when the project IDs are the same; else the
+ * tree quota mechanism would be circumvented.
+ */
+ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+ error = XFS_ERROR(EXDEV);
+ goto error_return;
+ }
+
+ /*
+ * Set up the target.
+ */
+ if (target_ip == NULL) {
+ /*
+ * If there's no space reservation, check the entry will
+ * fit before actually inserting it.
+ */
+ error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+ if (error)
+ goto error_return;
+ /*
+ * If target does not exist and the rename crosses
+ * directories, adjust the target directory link count
+ * to account for the ".." reference from the new entry.
+ */
+ error = xfs_dir_createname(tp, target_dp, target_name,
+ src_ip->i_ino, &first_block,
+ &free_list, spaceres);
+ if (error == ENOSPC)
+ goto error_return;
+ if (error)
+ goto abort_return;
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ if (new_parent && src_is_directory) {
+ error = xfs_bumplink(tp, target_dp);
+ if (error)
+ goto abort_return;
+ }
+ } else { /* target_ip != NULL */
+ /*
+ * If target exists and it's a directory, check that both
+ * target and source are directories and that target can be
+ * destroyed, or that neither is a directory.
+ */
+ if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+ /*
+ * Make sure target dir is empty.
+ */
+ if (!(xfs_dir_isempty(target_ip)) ||
+ (target_ip->i_d.di_nlink > 2)) {
+ error = XFS_ERROR(EEXIST);
+ goto error_return;
+ }
+ }
+
+ /*
+ * Link the source inode under the target name.
+ * If the source inode is a directory and we are moving
+ * it across directories, its ".." entry will be
+ * inconsistent until we replace that down below.
+ *
+ * In case there is already an entry with the same
+ * name at the destination directory, remove it first.
+ */
+ error = xfs_dir_replace(tp, target_dp, target_name,
+ src_ip->i_ino,
+ &first_block, &free_list, spaceres);
+ if (error)
+ goto abort_return;
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ /*
+ * Decrement the link count on the target since the target
+ * dir no longer points to it.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto abort_return;
+
+ if (src_is_directory) {
+ /*
+ * Drop the link from the old "." entry.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ goto abort_return;
+ }
+ } /* target_ip != NULL */
+
+ /*
+ * Remove the source.
+ */
+ if (new_parent && src_is_directory) {
+ /*
+ * Rewrite the ".." entry to point to the new
+ * directory.
+ */
+ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+ target_dp->i_ino,
+ &first_block, &free_list, spaceres);
+ ASSERT(error != EEXIST);
+ if (error)
+ goto abort_return;
+ }
+
+ /*
+ * We always want to hit the ctime on the source inode.
+ *
+ * This isn't strictly required by the standards since the source
+ * inode isn't really being changed, but old unix file systems did
+ * it and some incremental backup programs won't work without it.
+ */
+ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+ /*
+ * Adjust the link count on src_dp. This is necessary when
+ * renaming a directory, either within one parent when
+ * the target existed, or across two parent directories.
+ */
+ if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+ /*
+ * Decrement link count on src_directory since the
+ * entry that's moved no longer points to it.
+ */
+ error = xfs_droplink(tp, src_dp);
+ if (error)
+ goto abort_return;
+ }
+
+ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ &first_block, &free_list, spaceres);
+ if (error)
+ goto abort_return;
+
+ xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+ if (new_parent)
+ xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * rename transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT));
+ goto std_return;
+ }
+
+ /*
+ * trans_commit will unlock src_ip, target_ip & decrement
+ * the vnode references.
+ */
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
+}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
new file mode 100644
index 00000000..8f76fdff
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.c
@@ -0,0 +1,2325 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_fsops.h"
+#include "xfs_error.h"
+#include "xfs_rw.h"
+#include "xfs_inode_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_utils.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+
+
+/*
+ * Prototypes for internal functions.
+ */
+
+
+STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+ xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
+STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
+ xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
+STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+ xfs_extlen_t, int, xfs_rtblock_t *, int *);
+STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+ xfs_rtblock_t, xfs_rtblock_t *);
+STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+ xfs_rtblock_t, xfs_rtblock_t *);
+STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
+ xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
+STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+ xfs_extlen_t, int);
+STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
+ xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ */
+STATIC int /* error */
+xfs_growfs_rt_alloc(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_extlen_t oblocks, /* old count of blocks */
+ xfs_extlen_t nblocks, /* new count of blocks */
+ xfs_inode_t *ip) /* inode (bitmap/summary) */
+{
+ xfs_fileoff_t bno; /* block number in file */
+ xfs_buf_t *bp; /* temporary buffer for zeroing */
+ int committed; /* transaction committed flag */
+ xfs_daddr_t d; /* disk block address */
+ int error; /* error return value */
+ xfs_fsblock_t firstblock; /* first block allocated in xaction */
+ xfs_bmap_free_t flist; /* list of freed blocks */
+ xfs_fsblock_t fsbno; /* filesystem block for bno */
+ xfs_bmbt_irec_t map; /* block map output */
+ int nmap; /* number of block maps */
+ int resblks; /* space reservation */
+
+ /*
+ * Allocate space to the file, as necessary.
+ */
+ while (oblocks < nblocks) {
+ int cancelflags = 0;
+ xfs_trans_t *tp;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
+ resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
+ /*
+ * Reserve space & log for one extent added to the file.
+ */
+ if ((error = xfs_trans_reserve(tp, resblks,
+ XFS_GROWRTALLOC_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_DEFAULT_PERM_LOG_COUNT)))
+ goto error_cancel;
+ cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ /*
+ * Lock the inode.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+
+ xfs_bmap_init(&flist, &firstblock);
+ /*
+ * Allocate blocks to the bitmap file.
+ */
+ nmap = 1;
+ cancelflags |= XFS_TRANS_ABORT;
+ error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
+ XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
+ resblks, &map, &nmap, &flist);
+ if (!error && nmap < 1)
+ error = XFS_ERROR(ENOSPC);
+ if (error)
+ goto error_cancel;
+ /*
+ * Free any blocks freed up in the transaction, then commit.
+ */
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
+ goto error_cancel;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error;
+ /*
+ * Now we need to clear the allocated blocks.
+ * Do this one block per transaction, to keep it simple.
+ */
+ cancelflags = 0;
+ for (bno = map.br_startoff, fsbno = map.br_startblock;
+ bno < map.br_startoff + map.br_blockcount;
+ bno++, fsbno++) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
+ /*
+ * Reserve log for one block zeroing.
+ */
+ if ((error = xfs_trans_reserve(tp, 0,
+ XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
+ goto error_cancel;
+ /*
+ * Lock the bitmap inode.
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+ /*
+ * Get a buffer for the block.
+ */
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0);
+ if (bp == NULL) {
+ error = XFS_ERROR(EIO);
+error_cancel:
+ xfs_trans_cancel(tp, cancelflags);
+ goto error;
+ }
+ memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
+ xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+ /*
+ * Commit the transaction.
+ */
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto error;
+ }
+ /*
+ * Go on to the next extent, if any.
+ */
+ oblocks = map.br_startoff + map.br_blockcount;
+ }
+ return 0;
+error:
+ return error;
+}
+
+/*
+ * Attempt to allocate an extent minlen<=len<=maxlen starting from
+ * bitmap block bbno. If we don't get maxlen then use prod to trim
+ * the length, if given. Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_block(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_rtblock_t *nextp, /* out: next block to try */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ xfs_rtblock_t besti; /* best rtblock found so far */
+ xfs_rtblock_t bestlen; /* best length found so far */
+ xfs_rtblock_t end; /* last rtblock in chunk */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current rtblock trying */
+ xfs_rtblock_t next; /* next rtblock to try */
+ int stat; /* status from internal calls */
+
+ /*
+ * Loop over all the extents starting in this bitmap block,
+ * looking for one that's long enough.
+ */
+ for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
+ end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+ i <= end;
+ i++) {
+ /*
+ * See if there's a free extent of maxlen starting at i.
+ * If it's not so then next will contain the first non-free.
+ */
+ error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+ if (error) {
+ return error;
+ }
+ if (stat) {
+ /*
+ * i for maxlen is all free, allocate and return that.
+ */
+ error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
+ rsb);
+ if (error) {
+ return error;
+ }
+ *len = maxlen;
+ *rtblock = i;
+ return 0;
+ }
+ /*
+ * In the case where we have a variable-sized allocation
+ * request, figure out how big this free piece is,
+ * and if it's big enough for the minimum, and the best
+ * so far, remember it.
+ */
+ if (minlen < maxlen) {
+ xfs_rtblock_t thislen; /* this extent size */
+
+ thislen = next - i;
+ if (thislen >= minlen && thislen > bestlen) {
+ besti = i;
+ bestlen = thislen;
+ }
+ }
+ /*
+ * If not done yet, find the start of the next free space.
+ */
+ if (next < end) {
+ error = xfs_rtfind_forw(mp, tp, next, end, &i);
+ if (error) {
+ return error;
+ }
+ } else
+ break;
+ }
+ /*
+ * Searched the whole thing & didn't find a maxlen free extent.
+ */
+ if (minlen < maxlen && besti != -1) {
+ xfs_extlen_t p; /* amount to trim length by */
+
+ /*
+ * If size should be a multiple of prod, make that so.
+ */
+ if (prod > 1 && (p = do_mod(bestlen, prod)))
+ bestlen -= p;
+ /*
+ * Allocate besti for bestlen & return that.
+ */
+ error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ *len = bestlen;
+ *rtblock = besti;
+ return 0;
+ }
+ /*
+ * Allocation failed. Set *nextp to the next block to try.
+ */
+ *nextp = next;
+ *rtblock = NULLRTBLOCK;
+ return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting at block
+ * bno. If we don't get maxlen then use prod to trim the length, if given.
+ * Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_exact(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ int error; /* error value */
+ xfs_extlen_t i; /* extent length trimmed due to prod */
+ int isfree; /* extent is free */
+ xfs_rtblock_t next; /* next block to try (dummy) */
+
+ ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ /*
+ * Check if the range in question (for maxlen) is free.
+ */
+ error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+ if (error) {
+ return error;
+ }
+ if (isfree) {
+ /*
+ * If it is, allocate it and return success.
+ */
+ error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ *len = maxlen;
+ *rtblock = bno;
+ return 0;
+ }
+ /*
+ * If not, allocate what there is, if it's at least minlen.
+ */
+ maxlen = next - bno;
+ if (maxlen < minlen) {
+ /*
+ * Failed, return failure status.
+ */
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ /*
+ * Trim off tail of extent, if prod is specified.
+ */
+ if (prod > 1 && (i = maxlen % prod)) {
+ maxlen -= i;
+ if (maxlen < minlen) {
+ /*
+ * Now we can't do it, return failure status.
+ */
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ }
+ /*
+ * Allocate what we can and return it.
+ */
+ error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ *len = maxlen;
+ *rtblock = bno;
+ return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting as near
+ * to bno as possible. If we don't get maxlen then use prod to trim
+ * the length, if given. The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_near(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ int any; /* any useful extents from summary */
+ xfs_rtblock_t bbno; /* bitmap block number */
+ int error; /* error value */
+ int i; /* bitmap block offset (loop control) */
+ int j; /* secondary loop control */
+ int log2len; /* log2 of minlen */
+ xfs_rtblock_t n; /* next block to try */
+ xfs_rtblock_t r; /* result block */
+
+ ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ /*
+ * If the block number given is off the end, silently set it to
+ * the last block.
+ */
+ if (bno >= mp->m_sb.sb_rextents)
+ bno = mp->m_sb.sb_rextents - 1;
+ /*
+ * Try the exact allocation first.
+ */
+ error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
+ rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If the exact allocation worked, return that.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ bbno = XFS_BITTOBLOCK(mp, bno);
+ i = 0;
+ ASSERT(minlen != 0);
+ log2len = xfs_highbit32(minlen);
+ /*
+ * Loop over all bitmap blocks (bbno + i is current block).
+ */
+ for (;;) {
+ /*
+ * Get summary information of extents of all useful levels
+ * starting in this bitmap block.
+ */
+ error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
+ bbno + i, rbpp, rsb, &any);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are any useful extents starting here, try
+ * allocating one.
+ */
+ if (any) {
+ /*
+ * On the positive side of the starting location.
+ */
+ if (i >= 0) {
+ /*
+ * Try to allocate an extent starting in
+ * this block.
+ */
+ error = xfs_rtallocate_extent_block(mp, tp,
+ bbno + i, minlen, maxlen, len, &n, rbpp,
+ rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it worked, return it.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ }
+ /*
+ * On the negative side of the starting location.
+ */
+ else { /* i < 0 */
+ /*
+ * Loop backwards through the bitmap blocks from
+ * the starting point-1 up to where we are now.
+ * There should be an extent which ends in this
+ * bitmap block and is long enough.
+ */
+ for (j = -1; j > i; j--) {
+ /*
+ * Grab the summary information for
+ * this bitmap block.
+ */
+ error = xfs_rtany_summary(mp, tp,
+ log2len, mp->m_rsumlevels - 1,
+ bbno + j, rbpp, rsb, &any);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there's no extent given in the
+ * summary that means the extent we
+ * found must carry over from an
+ * earlier block. If there is an
+ * extent given, we've already tried
+ * that allocation, don't do it again.
+ */
+ if (any)
+ continue;
+ error = xfs_rtallocate_extent_block(mp,
+ tp, bbno + j, minlen, maxlen,
+ len, &n, rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it works, return the extent.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ }
+ /*
+ * There weren't intervening bitmap blocks
+ * with a long enough extent, or the
+ * allocation didn't work for some reason
+ * (i.e. it's a little * too short).
+ * Try to allocate from the summary block
+ * that we found.
+ */
+ error = xfs_rtallocate_extent_block(mp, tp,
+ bbno + i, minlen, maxlen, len, &n, rbpp,
+ rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it works, return the extent.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ }
+ }
+ /*
+ * Loop control. If we were on the positive side, and there's
+ * still more blocks on the negative side, go there.
+ */
+ if (i > 0 && (int)bbno - i >= 0)
+ i = -i;
+ /*
+ * If positive, and no more negative, but there are more
+ * positive, go there.
+ */
+ else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
+ i++;
+ /*
+ * If negative or 0 (just started), and there are positive
+ * blocks to go, go there. The 0 case moves to block 1.
+ */
+ else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
+ i = 1 - i;
+ /*
+ * If negative or 0 and there are more negative blocks,
+ * go there.
+ */
+ else if (i <= 0 && (int)bbno + i > 0)
+ i--;
+ /*
+ * Must be done. Return failure.
+ */
+ else
+ break;
+ }
+ *rtblock = NULLRTBLOCK;
+ return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, with no position
+ * specified. If we don't get maxlen then use prod to trim
+ * the length, if given. The lengths are all in rtextents.
+ */
+STATIC int /* error */
+xfs_rtallocate_extent_size(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ int error; /* error value */
+ int i; /* bitmap block number */
+ int l; /* level number (loop control) */
+ xfs_rtblock_t n; /* next block to be tried */
+ xfs_rtblock_t r; /* result block number */
+ xfs_suminfo_t sum; /* summary information for extents */
+
+ ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ ASSERT(maxlen != 0);
+
+ /*
+ * Loop over all the levels starting with maxlen.
+ * At each level, look at all the bitmap blocks, to see if there
+ * are extents starting there that are long enough (>= maxlen).
+ * Note, only on the initial level can the allocation fail if
+ * the summary says there's an extent.
+ */
+ for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
+ /*
+ * Loop over all the bitmap blocks.
+ */
+ for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+ /*
+ * Get the summary for this level/block.
+ */
+ error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+ &sum);
+ if (error) {
+ return error;
+ }
+ /*
+ * Nothing there, on to the next block.
+ */
+ if (!sum)
+ continue;
+ /*
+ * Try allocating the extent.
+ */
+ error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
+ maxlen, len, &n, rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it worked, return that.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ /*
+ * If the "next block to try" returned from the
+ * allocator is beyond the next bitmap block,
+ * skip to that bitmap block.
+ */
+ if (XFS_BITTOBLOCK(mp, n) > i + 1)
+ i = XFS_BITTOBLOCK(mp, n) - 1;
+ }
+ }
+ /*
+ * Didn't find any maxlen blocks. Try smaller ones, unless
+ * we're asking for a fixed size extent.
+ */
+ if (minlen > --maxlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ ASSERT(minlen != 0);
+ ASSERT(maxlen != 0);
+
+ /*
+ * Loop over sizes, from maxlen down to minlen.
+ * This time, when we do the allocations, allow smaller ones
+ * to succeed.
+ */
+ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
+ /*
+ * Loop over all the bitmap blocks, try an allocation
+ * starting in that block.
+ */
+ for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+ /*
+ * Get the summary information for this level/block.
+ */
+ error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+ &sum);
+ if (error) {
+ return error;
+ }
+ /*
+ * If nothing there, go on to next.
+ */
+ if (!sum)
+ continue;
+ /*
+ * Try the allocation. Make sure the specified
+ * minlen/maxlen are in the possible range for
+ * this summary level.
+ */
+ error = xfs_rtallocate_extent_block(mp, tp, i,
+ XFS_RTMAX(minlen, 1 << l),
+ XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
+ len, &n, rbpp, rsb, prod, &r);
+ if (error) {
+ return error;
+ }
+ /*
+ * If it worked, return that extent.
+ */
+ if (r != NULLRTBLOCK) {
+ *rtblock = r;
+ return 0;
+ }
+ /*
+ * If the "next block to try" returned from the
+ * allocator is beyond the next bitmap block,
+ * skip to that bitmap block.
+ */
+ if (XFS_BITTOBLOCK(mp, n) > i + 1)
+ i = XFS_BITTOBLOCK(mp, n) - 1;
+ }
+ }
+ /*
+ * Got nothing, return failure.
+ */
+ *rtblock = NULLRTBLOCK;
+ return 0;
+}
+
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int /* error */
+xfs_rtallocate_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* start block to allocate */
+ xfs_extlen_t len, /* length to allocate */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the allocated extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock; /* first block allocated > end */
+ xfs_rtblock_t preblock; /* first block allocated < start */
+
+ end = start + len - 1;
+ /*
+ * Assume we're allocating out of the middle of a free extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of free extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Decrement the summary information corresponding to the entire
+ * (old) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are blocks not being allocated at the front of the
+ * old extent, add summary data for them to be free.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being allocated at the end of the
+ * old extent, add summary data for them to be free.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Modify the bitmap to mark this extent allocated.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 0);
+ return error;
+}
+
+/*
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
+ */
+STATIC int /* error */
+xfs_rtany_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ int *stat) /* out: any good extents here? */
+{
+ int error; /* error value */
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
+
+ /*
+ * Loop over logs of extent sizes. Order is irrelevant.
+ */
+ for (log = low; log <= high; log++) {
+ /*
+ * Get one summary datum.
+ */
+ error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are any, return success.
+ */
+ if (sum) {
+ *stat = 1;
+ return 0;
+ }
+ }
+ /*
+ * Found nothing, return failure.
+ */
+ *stat = 0;
+ return 0;
+}
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+STATIC int /* error */
+xfs_rtbuf_get(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t block, /* block number in bitmap or summary */
+ int issum, /* is summary not bitmap */
+ xfs_buf_t **bpp) /* output: buffer for the block */
+{
+ xfs_buf_t *bp; /* block buffer, result */
+ xfs_daddr_t d; /* disk addr of block */
+ int error; /* error value */
+ xfs_fsblock_t fsb; /* fs block number for block */
+ xfs_inode_t *ip; /* bitmap or summary inode */
+
+ ip = issum ? mp->m_rsumip : mp->m_rbmip;
+ /*
+ * Map from the file offset (block) and inode number to the
+ * file system block.
+ */
+ error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
+ if (error) {
+ return error;
+ }
+ ASSERT(fsb != NULLFSBLOCK);
+ /*
+ * Convert to disk address for buffer cache.
+ */
+ d = XFS_FSB_TO_DADDR(mp, fsb);
+ /*
+ * Read the buffer.
+ */
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0, &bp);
+ if (error) {
+ return error;
+ }
+ ASSERT(bp && !XFS_BUF_GETERROR(bp));
+ *bpp = bp;
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int /* error */
+xfs_rtcheck_alloc_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* out: 1 for allocated, 0 for not */
+{
+ xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
+
+ return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
+}
+#endif
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+STATIC int /* error */
+xfs_rtcheck_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtblock_t *new, /* out: first block not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zero's; 1 (free) => all one's.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not examined.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ /*
+ * Mask of relevant bits.
+ */
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ val)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Mask of relevant bits.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * Successful, return.
+ */
+ xfs_trans_brelse(tp, bp);
+ *new = start + i;
+ *stat = 1;
+ return 0;
+}
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int /* error */
+xfs_rtcopy_summary(
+ xfs_mount_t *omp, /* old file system mount point */
+ xfs_mount_t *nmp, /* new file system mount point */
+ xfs_trans_t *tp) /* transaction pointer */
+{
+ xfs_rtblock_t bbno; /* bitmap block number */
+ xfs_buf_t *bp; /* summary buffer */
+ int error; /* error return value */
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
+ xfs_fsblock_t sumbno; /* summary block number */
+
+ bp = NULL;
+ for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ (xfs_srtblock_t)bbno >= 0;
+ bbno--) {
+ error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+ &sumbno, &sum);
+ if (error)
+ return error;
+ if (sum == 0)
+ continue;
+ error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+ &bp, &sumbno);
+ if (error)
+ return error;
+ error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+ &bp, &sumbno);
+ if (error)
+ return error;
+ ASSERT(sum > 0);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+STATIC int /* error */
+xfs_rtfind_back(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t firstbit; /* first useful bit in the word */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = start - limit + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit < XFS_NBWORD - 1) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+ firstbit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = bit - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i = bit - firstbit + 1;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the previous one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if (len - i) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_NBWORD - (len - i);
+ mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start - i + 1;
+ return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+STATIC int /* error */
+xfs_rtfind_forw(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in the word */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = limit - start + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit) {
+ /*
+ * Calculate last (rightmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = 0;
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Calculate mask for all the relevant bits in this word.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start + i - 1;
+ return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int /* error */
+xfs_rtfree_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to free */
+ xfs_extlen_t len, /* length to free */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock; /* first block freed > end */
+ xfs_rtblock_t preblock; /* first block freed < start */
+
+ end = start + len - 1;
+ /*
+ * Modify the bitmap to mark this extent freed.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ if (error) {
+ return error;
+ }
+ /*
+ * Assume we're freeing out of the middle of an allocated extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of allocated extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error)
+ return error;
+ /*
+ * If there are blocks not being freed at the front of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being freed at the end of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Increment the summary information corresponding to the entire
+ * (new) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ return error;
+}
+
+/*
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+STATIC int /* error */
+xfs_rtget_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
+{
+ xfs_buf_t *bp; /* buffer for summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (rbpp && *rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (rbpp && *rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ if (rbpp) {
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ }
+ /*
+ * Point to the summary information & copy it out.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ *sum = *sp;
+ /*
+ * Drop the buffer if we're not asked to remember it.
+ */
+ if (!rbpp)
+ xfs_trans_brelse(tp, bp);
+ return 0;
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+STATIC int /* error */
+xfs_rtmodify_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to modify */
+ xfs_extlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtword_t *first; /* first used word in the buffer */
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask o frelevant bits for value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block, and point to its data.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ first = b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zeroes; 1 (free) => all ones.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not changed and mask of relevant bits.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ i = lastbit - bit;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Set the word value correctly.
+ */
+ *b = val;
+ i += XFS_NBWORD;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Compute a mask of relevant bits.
+ */
+ bit = 0;
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ b++;
+ }
+ /*
+ * Log any remaining changed bytes.
+ */
+ if (b > first)
+ xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp - 1));
+ return 0;
+}
+
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+STATIC int /* error */
+xfs_rtmodify_summary(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_buf_t *bp; /* buffer for the summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (rbpp && *rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (rbpp && *rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ if (rbpp) {
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ }
+ /*
+ * Point to the summary information, modify and log it.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ *sp += delta;
+ xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)),
+ (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1));
+ return 0;
+}
+
+/*
+ * Visible (exported) functions.
+ */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_growfs_rt_t *in) /* growfs rt input struct */
+{
+ xfs_rtblock_t bmbno; /* bitmap block number */
+ xfs_buf_t *bp; /* temporary buffer */
+ int error; /* error return value */
+ xfs_mount_t *nmp; /* new (fake) mount structure */
+ xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
+ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
+ xfs_drtbno_t nrextents; /* new number of realtime extents */
+ uint8_t nrextslog; /* new log2 of sb_rextents */
+ xfs_extlen_t nrsumblocks; /* new number of summary blocks */
+ uint nrsumlevels; /* new rt summary levels */
+ uint nrsumsize; /* new size of rt summary, bytes */
+ xfs_sb_t *nsbp; /* new superblock */
+ xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
+ xfs_extlen_t rsumblocks; /* current number of rt summary blks */
+ xfs_sb_t *sbp; /* old superblock */
+ xfs_fsblock_t sumbno; /* summary block number */
+
+ sbp = &mp->m_sb;
+ /*
+ * Initial error checking.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
+ if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
+ (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
+ (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
+ return XFS_ERROR(EINVAL);
+ if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
+ return error;
+ /*
+ * Read in the last block of the device, make sure it exists.
+ */
+ bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ XFS_FSB_TO_BB(mp, nrblocks - 1),
+ XFS_FSB_TO_B(mp, 1), 0);
+ if (!bp)
+ return EIO;
+ xfs_buf_relse(bp);
+
+ /*
+ * Calculate new parameters. These are the final values to be reached.
+ */
+ nrextents = nrblocks;
+ do_div(nrextents, in->extsize);
+ nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
+ nrextslog = xfs_highbit32(nrextents);
+ nrsumlevels = nrextslog + 1;
+ nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
+ nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /*
+ * New summary size can't be more than half the size of
+ * the log. This prevents us from getting a log overflow,
+ * since we'll log basically the whole summary file at once.
+ */
+ if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
+ return XFS_ERROR(EINVAL);
+ /*
+ * Get the old block counts for bitmap and summary inodes.
+ * These can't change since other growfs callers are locked out.
+ */
+ rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size);
+ rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size);
+ /*
+ * Allocate space to the bitmap and summary files, as necessary.
+ */
+ error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
+ if (error)
+ return error;
+ error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
+ if (error)
+ return error;
+ /*
+ * Allocate a new (fake) mount/sb.
+ */
+ nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+ /*
+ * Loop over the bitmap blocks.
+ * We will do everything one bitmap block at a time.
+ * Skip the current block if it is exactly full.
+ * This also deals with the case where there were no rtextents before.
+ */
+ for (bmbno = sbp->sb_rbmblocks -
+ ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
+ bmbno < nrbmblocks;
+ bmbno++) {
+ xfs_trans_t *tp;
+ int cancelflags = 0;
+
+ *nmp = *mp;
+ nsbp = &nmp->m_sb;
+ /*
+ * Calculate new sb and mount fields for this round.
+ */
+ nsbp->sb_rextsize = in->extsize;
+ nsbp->sb_rbmblocks = bmbno + 1;
+ nsbp->sb_rblocks =
+ XFS_RTMIN(nrblocks,
+ nsbp->sb_rbmblocks * NBBY *
+ nsbp->sb_blocksize * nsbp->sb_rextsize);
+ nsbp->sb_rextents = nsbp->sb_rblocks;
+ do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+ ASSERT(nsbp->sb_rextents != 0);
+ nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+ nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
+ nrsumsize =
+ (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
+ nsbp->sb_rbmblocks;
+ nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+ nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /*
+ * Start a transaction, get the log reservation.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
+ if ((error = xfs_trans_reserve(tp, 0,
+ XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
+ goto error_cancel;
+ /*
+ * Lock out other callers by grabbing the bitmap inode lock.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ /*
+ * Update the bitmap inode's size.
+ */
+ mp->m_rbmip->i_d.di_size =
+ nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ cancelflags |= XFS_TRANS_ABORT;
+ /*
+ * Get the summary inode into the transaction.
+ */
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ /*
+ * Update the summary inode's size.
+ */
+ mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
+ xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
+ /*
+ * Copy summary data from old to new sizes.
+ * Do this when the real size (not block-aligned) changes.
+ */
+ if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
+ mp->m_rsumlevels != nmp->m_rsumlevels) {
+ error = xfs_rtcopy_summary(mp, nmp, tp);
+ if (error)
+ goto error_cancel;
+ }
+ /*
+ * Update superblock fields.
+ */
+ if (nsbp->sb_rextsize != sbp->sb_rextsize)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+ nsbp->sb_rextsize - sbp->sb_rextsize);
+ if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
+ if (nsbp->sb_rblocks != sbp->sb_rblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+ nsbp->sb_rblocks - sbp->sb_rblocks);
+ if (nsbp->sb_rextents != sbp->sb_rextents)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ if (nsbp->sb_rextslog != sbp->sb_rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ nsbp->sb_rextslog - sbp->sb_rextslog);
+ /*
+ * Free new extent.
+ */
+ bp = NULL;
+ error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
+ nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+ if (error) {
+error_cancel:
+ xfs_trans_cancel(tp, cancelflags);
+ break;
+ }
+ /*
+ * Mark more blocks free in the superblock.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
+ nsbp->sb_rextents - sbp->sb_rextents);
+ /*
+ * Update mp values into the real mp structure.
+ */
+ mp->m_rsumlevels = nrsumlevels;
+ mp->m_rsumsize = nrsumsize;
+
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ break;
+ }
+
+ /*
+ * Free the fake mp structure.
+ */
+ kmem_free(nmp);
+
+ return error;
+}
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters. The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int /* error */
+xfs_rtallocate_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
+ int wasdel, /* was a delayed allocation extent */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock) /* out: start block allocated */
+{
+ xfs_mount_t *mp = tp->t_mountp;
+ int error; /* error value */
+ xfs_rtblock_t r; /* result allocated block */
+ xfs_fsblock_t sb; /* summary file block number */
+ xfs_buf_t *sumbp; /* summary file block buffer */
+
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+ ASSERT(minlen > 0 && minlen <= maxlen);
+
+ /*
+ * If prod is set then figure out what to do to minlen and maxlen.
+ */
+ if (prod > 1) {
+ xfs_extlen_t i;
+
+ if ((i = maxlen % prod))
+ maxlen -= i;
+ if ((i = minlen % prod))
+ minlen += prod - i;
+ if (maxlen < minlen) {
+ *rtblock = NULLRTBLOCK;
+ return 0;
+ }
+ }
+
+ sumbp = NULL;
+ /*
+ * Allocate by size, or near another block, or exactly at some block.
+ */
+ switch (type) {
+ case XFS_ALLOCTYPE_ANY_AG:
+ error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
+ &sumbp, &sb, prod, &r);
+ break;
+ case XFS_ALLOCTYPE_NEAR_BNO:
+ error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
+ len, &sumbp, &sb, prod, &r);
+ break;
+ case XFS_ALLOCTYPE_THIS_BNO:
+ error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
+ len, &sumbp, &sb, prod, &r);
+ break;
+ default:
+ error = EIO;
+ ASSERT(0);
+ }
+ if (error)
+ return error;
+
+ /*
+ * If it worked, update the superblock.
+ */
+ if (r != NULLRTBLOCK) {
+ long slen = (long)*len;
+
+ ASSERT(*len >= minlen && *len <= maxlen);
+ if (wasdel)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
+ else
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
+ }
+ *rtblock = r;
+ return 0;
+}
+
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len) /* length of extent freed */
+{
+ int error; /* error value */
+ xfs_mount_t *mp; /* file system mount structure */
+ xfs_fsblock_t sb; /* summary file block number */
+ xfs_buf_t *sumbp; /* summary file block buffer */
+
+ mp = tp->t_mountp;
+ /*
+ * Synchronize by locking the bitmap inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+#if defined(__KERNEL__) && defined(DEBUG)
+ /*
+ * Check to see that this whole range is currently allocated.
+ */
+ {
+ int stat; /* result from checking range */
+
+ error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
+ if (error) {
+ return error;
+ }
+ ASSERT(stat);
+ }
+#endif
+ sumbp = NULL;
+ /*
+ * Free the range of realtime blocks.
+ */
+ error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+ if (error) {
+ return error;
+ }
+ /*
+ * Mark more blocks free in the superblock.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+ /*
+ * If we've now freed all the blocks, reset the file sequence
+ * number to 0.
+ */
+ if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+ mp->m_sb.sb_rextents) {
+ if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+ mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+ *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ }
+ return 0;
+}
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int /* error */
+xfs_rtmount_init(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ xfs_buf_t *bp; /* buffer for last block of subvolume */
+ xfs_daddr_t d; /* address of last block of subvolume */
+ xfs_sb_t *sbp; /* filesystem superblock copy in mount */
+
+ sbp = &mp->m_sb;
+ if (sbp->sb_rblocks == 0)
+ return 0;
+ if (mp->m_rtdev_targp == NULL) {
+ xfs_warn(mp,
+ "Filesystem has a realtime volume, use rtdev=device option");
+ return XFS_ERROR(ENODEV);
+ }
+ mp->m_rsumlevels = sbp->sb_rextslog + 1;
+ mp->m_rsumsize =
+ (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
+ sbp->sb_rbmblocks;
+ mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+ mp->m_rbmip = mp->m_rsumip = NULL;
+ /*
+ * Check that the realtime section is an ok size.
+ */
+ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
+ xfs_warn(mp, "realtime mount -- %llu != %llu",
+ (unsigned long long) XFS_BB_TO_FSB(mp, d),
+ (unsigned long long) mp->m_sb.sb_rblocks);
+ return XFS_ERROR(EFBIG);
+ }
+ bp = xfs_buf_read_uncached(mp, mp->m_rtdev_targp,
+ d - XFS_FSB_TO_BB(mp, 1),
+ XFS_FSB_TO_B(mp, 1), 0);
+ if (!bp) {
+ xfs_warn(mp, "realtime device size check failed");
+ return EIO;
+ }
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int /* error */
+xfs_rtmount_inodes(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ int error; /* error return value */
+ xfs_sb_t *sbp;
+
+ sbp = &mp->m_sb;
+ if (sbp->sb_rbmino == NULLFSINO)
+ return 0;
+ error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+ if (error)
+ return error;
+ ASSERT(mp->m_rbmip != NULL);
+ ASSERT(sbp->sb_rsumino != NULLFSINO);
+ error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+ if (error) {
+ IRELE(mp->m_rbmip);
+ return error;
+ }
+ ASSERT(mp->m_rsumip != NULL);
+ return 0;
+}
+
+void
+xfs_rtunmount_inodes(
+ struct xfs_mount *mp)
+{
+ if (mp->m_rbmip)
+ IRELE(mp->m_rbmip);
+ if (mp->m_rsumip)
+ IRELE(mp->m_rsumip);
+}
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int /* error */
+xfs_rtpick_extent(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_extlen_t len, /* allocation length (rtextents) */
+ xfs_rtblock_t *pick) /* result rt extent */
+{
+ xfs_rtblock_t b; /* result block */
+ int log2; /* log of sequence number */
+ __uint64_t resid; /* residual after log removed */
+ __uint64_t seq; /* sequence number of file creation */
+ __uint64_t *seqp; /* pointer to seqno in inode */
+
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+ seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
+ if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
+ mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+ *seqp = 0;
+ }
+ seq = *seqp;
+ if ((log2 = xfs_highbit64(seq)) == -1)
+ b = 0;
+ else {
+ resid = seq - (1ULL << log2);
+ b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
+ (log2 + 1);
+ if (b >= mp->m_sb.sb_rextents)
+ b = do_mod(b, mp->m_sb.sb_rextents);
+ if (b + len > mp->m_sb.sb_rextents)
+ b = mp->m_sb.sb_rextents - len;
+ }
+ *seqp = seq + 1;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ *pick = b;
+ return 0;
+}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
new file mode 100644
index 00000000..09e1f4f3
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_RTALLOC_H__
+#define __XFS_RTALLOC_H__
+
+struct xfs_mount;
+struct xfs_trans;
+
+/* Min and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
+#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
+#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
+
+/*
+ * Constants for bit manipulations.
+ */
+#define XFS_NBBYLOG 3 /* log2(NBBY) */
+#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
+#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
+#define XFS_NBWORD (1 << XFS_NBWORDLOG)
+#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
+
+#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
+
+/*
+ * Summary and bit manipulation macros.
+ */
+#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define XFS_SUMOFFSTOBLOCK(mp,s) \
+ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_SUMPTR(mp,bp,so) \
+ ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \
+ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
+#define XFS_BITTOWORD(mp,bi) \
+ ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define XFS_RTLOBIT(w) xfs_lowbit32(w)
+#define XFS_RTHIBIT(w) xfs_highbit32(w)
+
+#if XFS_BIG_BLKNOS
+#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
+#else
+#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
+#endif
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_XFS_RT
+/*
+ * Function prototypes for exported functions.
+ */
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters. The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int /* error */
+xfs_rtallocate_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to allocate */
+ xfs_extlen_t minlen, /* minimum length to allocate */
+ xfs_extlen_t maxlen, /* maximum length to allocate */
+ xfs_extlen_t *len, /* out: actual length allocated */
+ xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
+ int wasdel, /* was a delayed allocation extent */
+ xfs_extlen_t prod, /* extent product factor */
+ xfs_rtblock_t *rtblock); /* out: start block allocated */
+
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len); /* length of extent freed */
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int /* error */
+xfs_rtmount_init(
+ struct xfs_mount *mp); /* file system mount structure */
+void
+xfs_rtunmount_inodes(
+ struct xfs_mount *mp);
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int /* error */
+xfs_rtmount_inodes(
+ struct xfs_mount *mp); /* file system mount structure */
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int /* error */
+xfs_rtpick_extent(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_extlen_t len, /* allocation length (rtextents) */
+ xfs_rtblock_t *pick); /* result rt extent */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+ struct xfs_mount *mp, /* file system mount structure */
+ xfs_growfs_rt_t *in); /* user supplied growfs struct */
+
+#else
+# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS)
+# define xfs_rtfree_extent(t,b,l) (ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
+# define xfs_growfs_rt(mp,in) (ENOSYS)
+static inline int /* error */
+xfs_rtmount_init(
+ xfs_mount_t *mp) /* file system mount structure */
+{
+ if (mp->m_sb.sb_rblocks == 0)
+ return 0;
+
+ xfs_warn(mp, "Not built with CONFIG_XFS_RT");
+ return ENOSYS;
+}
+# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtunmount_inodes(m)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __KERNEL__ */
+
+#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
new file mode 100644
index 00000000..d6d6fdfe
--- /dev/null
+++ b/fs/xfs/xfs_rw.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_rw.h"
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping
+ * the filesystem consistent. We don't do an unmount here; just shutdown
+ * the shop, make sure that absolutely nothing persistent happens to
+ * this filesystem after this point.
+ */
+void
+xfs_do_force_shutdown(
+ xfs_mount_t *mp,
+ int flags,
+ char *fname,
+ int lnnum)
+{
+ int logerror;
+
+ logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_notice(mp,
+ "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
+ __func__, flags, lnnum, fname, __return_address);
+ }
+ /*
+ * No need to duplicate efforts.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
+ return;
+
+ /*
+ * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
+ * queue up anybody new on the log reservations, and wakes up
+ * everybody who's sleeping on log reservations to tell them
+ * the bad news.
+ */
+ if (xfs_log_force_umount(mp, logerror))
+ return;
+
+ if (flags & SHUTDOWN_CORRUPT_INCORE) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
+ "Corruption of in-memory data detected. Shutting down filesystem");
+ if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
+ xfs_stack_trace();
+ } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ if (logerror) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Log I/O Error Detected. Shutting down filesystem");
+ } else if (flags & SHUTDOWN_DEVICE_REQ) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "All device paths lost. Shutting down filesystem");
+ } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "I/O Error Detected. Shutting down filesystem");
+ }
+ }
+ if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+ xfs_alert(mp,
+ "Please umount the filesystem and rectify the problem(s)");
+ }
+}
+
+/*
+ * Prints out an ALERT message about I/O error.
+ */
+void
+xfs_ioerror_alert(
+ char *func,
+ struct xfs_mount *mp,
+ xfs_buf_t *bp,
+ xfs_daddr_t blkno)
+{
+ xfs_alert(mp,
+ "I/O error occurred: meta-data dev %s block 0x%llx"
+ " (\"%s\") error %d buf count %zd",
+ XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
+ (__uint64_t)blkno, func,
+ XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
+}
+
+/*
+ * This isn't an absolute requirement, but it is
+ * just a good idea to call xfs_read_buf instead of
+ * directly doing a read_buf call. For one, we shouldn't
+ * be doing this disk read if we are in SHUTDOWN state anyway,
+ * so this stops that from happening. Secondly, this does all
+ * the error checking stuff and the brelse if appropriate for
+ * the caller, so the code can be a little leaner.
+ */
+
+int
+xfs_read_buf(
+ struct xfs_mount *mp,
+ xfs_buftarg_t *target,
+ xfs_daddr_t blkno,
+ int len,
+ uint flags,
+ xfs_buf_t **bpp)
+{
+ xfs_buf_t *bp;
+ int error;
+
+ if (!flags)
+ flags = XBF_LOCK | XBF_MAPPED;
+
+ bp = xfs_buf_read(target, blkno, len, flags);
+ if (!bp)
+ return XFS_ERROR(EIO);
+ error = XFS_BUF_GETERROR(bp);
+ if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
+ *bpp = bp;
+ } else {
+ *bpp = NULL;
+ if (error) {
+ xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
+ } else {
+ error = XFS_ERROR(EIO);
+ }
+ if (bp) {
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_UNDELAYWRITE(bp);
+ XFS_BUF_STALE(bp);
+ /*
+ * brelse clears B_ERROR and b_error
+ */
+ xfs_buf_relse(bp);
+ }
+ }
+ return (error);
+}
+
+/*
+ * helper function to extract extent size hint from inode
+ */
+xfs_extlen_t
+xfs_get_extsz_hint(
+ struct xfs_inode *ip)
+{
+ if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
+ return ip->i_d.di_extsize;
+ if (XFS_IS_REALTIME_INODE(ip))
+ return ip->i_mount->m_sb.sb_rextsize;
+ return 0;
+}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
new file mode 100644
index 00000000..11c41ec6
--- /dev/null
+++ b/fs/xfs/xfs_rw.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_RW_H__
+#define __XFS_RW_H__
+
+struct xfs_buf;
+struct xfs_inode;
+struct xfs_mount;
+
+/*
+ * Convert the given file system block to a disk block.
+ * We have to treat it differently based on whether the
+ * file is a real time file or not, because the bmap code
+ * does.
+ */
+static inline xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+ return (XFS_IS_REALTIME_INODE(ip) ? \
+ (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+ XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
+
+/*
+ * Prototypes for functions in xfs_rw.c.
+ */
+extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
+ xfs_daddr_t blkno, int len, uint flags,
+ struct xfs_buf **bpp);
+extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
+ xfs_buf_t *bp, xfs_daddr_t blkno);
+extern xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
+
+#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
new file mode 100644
index 00000000..1eb2ba58
--- /dev/null
+++ b/fs/xfs/xfs_sb.h
@@ -0,0 +1,543 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SB_H__
+#define __XFS_SB_H__
+
+/*
+ * Super block
+ * Fits into a sector-sized buffer at address 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+
+#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
+#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
+#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
+#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
+#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_NUMBITS 0x000f
+#define XFS_SB_VERSION_ALLFBITS 0xfff0
+#define XFS_SB_VERSION_SASHFBITS 0xf000
+#define XFS_SB_VERSION_REALFBITS 0x0ff0
+#define XFS_SB_VERSION_ATTRBIT 0x0010
+#define XFS_SB_VERSION_NLINKBIT 0x0020
+#define XFS_SB_VERSION_QUOTABIT 0x0040
+#define XFS_SB_VERSION_ALIGNBIT 0x0080
+#define XFS_SB_VERSION_DALIGNBIT 0x0100
+#define XFS_SB_VERSION_SHAREDBIT 0x0200
+#define XFS_SB_VERSION_LOGV2BIT 0x0400
+#define XFS_SB_VERSION_SECTORBIT 0x0800
+#define XFS_SB_VERSION_EXTFLGBIT 0x1000
+#define XFS_SB_VERSION_DIRV2BIT 0x2000
+#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
+#define XFS_SB_VERSION_MOREBITSBIT 0x8000
+#define XFS_SB_VERSION_OKSASHFBITS \
+ (XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_BORGBIT)
+#define XFS_SB_VERSION_OKREALFBITS \
+ (XFS_SB_VERSION_ATTRBIT | \
+ XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_QUOTABIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_DALIGNBIT | \
+ XFS_SB_VERSION_SHAREDBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_SECTORBIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+#define XFS_SB_VERSION_OKREALBITS \
+ (XFS_SB_VERSION_NUMBITS | \
+ XFS_SB_VERSION_OKREALFBITS | \
+ XFS_SB_VERSION_OKSASHFBITS)
+
+/*
+ * There are two words to hold XFS "feature" bits: the original
+ * word, sb_versionnum, and sb_features2. Whenever a bit is set in
+ * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
+ *
+ * These defines represent bits in sb_features2.
+ */
+#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */
+#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
+#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
+#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
+#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
+#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
+
+#define XFS_SB_VERSION2_OKREALFBITS \
+ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT)
+#define XFS_SB_VERSION2_OKSASHFBITS \
+ (0)
+#define XFS_SB_VERSION2_OKREALBITS \
+ (XFS_SB_VERSION2_OKREALFBITS | \
+ XFS_SB_VERSION2_OKSASHFBITS )
+
+/*
+ * Superblock - in core version. Must match the ondisk version below.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_sb {
+ __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ __uint32_t sb_blocksize; /* logical block size, bytes */
+ xfs_drfsbno_t sb_dblocks; /* number of data blocks */
+ xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */
+ xfs_drtbno_t sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* file system unique id */
+ xfs_dfsbno_t sb_logstart; /* starting block of log if internal */
+ xfs_ino_t sb_rootino; /* root inode number */
+ xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
+ xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
+ xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
+ xfs_agblock_t sb_agblocks; /* size of an allocation group */
+ xfs_agnumber_t sb_agcount; /* number of allocation groups */
+ xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
+ xfs_extlen_t sb_logblocks; /* number of log blocks */
+ __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
+ __uint16_t sb_sectsize; /* volume sector size, bytes */
+ __uint16_t sb_inodesize; /* inode size, bytes */
+ __uint16_t sb_inopblock; /* inodes per block */
+ char sb_fname[12]; /* file system name */
+ __uint8_t sb_blocklog; /* log2 of sb_blocksize */
+ __uint8_t sb_sectlog; /* log2 of sb_sectsize */
+ __uint8_t sb_inodelog; /* log2 of sb_inodesize */
+ __uint8_t sb_inopblog; /* log2 of sb_inopblock */
+ __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ __uint8_t sb_rextslog; /* log2 of sb_rextents */
+ __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
+ __uint8_t sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ __uint64_t sb_icount; /* allocated inodes */
+ __uint64_t sb_ifree; /* free inodes */
+ __uint64_t sb_fdblocks; /* free data blocks */
+ __uint64_t sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ xfs_ino_t sb_uquotino; /* user quota inode */
+ xfs_ino_t sb_gquotino; /* group quota inode */
+ __uint16_t sb_qflags; /* quota flags */
+ __uint8_t sb_flags; /* misc. flags */
+ __uint8_t sb_shared_vn; /* shared version number */
+ xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ __uint32_t sb_unit; /* stripe or raid unit */
+ __uint32_t sb_width; /* stripe or raid width */
+ __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
+ __uint8_t sb_logsectlog; /* log2 of the log sector size */
+ __uint16_t sb_logsectsize; /* sector size for the log, bytes */
+ __uint32_t sb_logsunit; /* stripe unit size for the log */
+ __uint32_t sb_features2; /* additional feature bits */
+
+ /*
+ * bad features2 field as a result of failing to pad the sb
+ * structure to 64 bits. Some machines will be using this field
+ * for features2 bits. Easiest just to mark it bad and not use
+ * it for anything else.
+ */
+ __uint32_t sb_bad_features2;
+
+ /* must be padded to 64 bit alignment */
+} xfs_sb_t;
+
+/*
+ * Superblock - on disk version. Must match the in core version above.
+ * Must be padded to 64 bit alignment.
+ */
+typedef struct xfs_dsb {
+ __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */
+ __be32 sb_blocksize; /* logical block size, bytes */
+ __be64 sb_dblocks; /* number of data blocks */
+ __be64 sb_rblocks; /* number of realtime blocks */
+ __be64 sb_rextents; /* number of realtime extents */
+ uuid_t sb_uuid; /* file system unique id */
+ __be64 sb_logstart; /* starting block of log if internal */
+ __be64 sb_rootino; /* root inode number */
+ __be64 sb_rbmino; /* bitmap inode for realtime extents */
+ __be64 sb_rsumino; /* summary inode for rt bitmap */
+ __be32 sb_rextsize; /* realtime extent size, blocks */
+ __be32 sb_agblocks; /* size of an allocation group */
+ __be32 sb_agcount; /* number of allocation groups */
+ __be32 sb_rbmblocks; /* number of rt bitmap blocks */
+ __be32 sb_logblocks; /* number of log blocks */
+ __be16 sb_versionnum; /* header version == XFS_SB_VERSION */
+ __be16 sb_sectsize; /* volume sector size, bytes */
+ __be16 sb_inodesize; /* inode size, bytes */
+ __be16 sb_inopblock; /* inodes per block */
+ char sb_fname[12]; /* file system name */
+ __u8 sb_blocklog; /* log2 of sb_blocksize */
+ __u8 sb_sectlog; /* log2 of sb_sectsize */
+ __u8 sb_inodelog; /* log2 of sb_inodesize */
+ __u8 sb_inopblog; /* log2 of sb_inopblock */
+ __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */
+ __u8 sb_rextslog; /* log2 of sb_rextents */
+ __u8 sb_inprogress; /* mkfs is in progress, don't mount */
+ __u8 sb_imax_pct; /* max % of fs for inode space */
+ /* statistics */
+ /*
+ * These fields must remain contiguous. If you really
+ * want to change their layout, make sure you fix the
+ * code in xfs_trans_apply_sb_deltas().
+ */
+ __be64 sb_icount; /* allocated inodes */
+ __be64 sb_ifree; /* free inodes */
+ __be64 sb_fdblocks; /* free data blocks */
+ __be64 sb_frextents; /* free realtime extents */
+ /*
+ * End contiguous fields.
+ */
+ __be64 sb_uquotino; /* user quota inode */
+ __be64 sb_gquotino; /* group quota inode */
+ __be16 sb_qflags; /* quota flags */
+ __u8 sb_flags; /* misc. flags */
+ __u8 sb_shared_vn; /* shared version number */
+ __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */
+ __be32 sb_unit; /* stripe or raid unit */
+ __be32 sb_width; /* stripe or raid width */
+ __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */
+ __u8 sb_logsectlog; /* log2 of the log sector size */
+ __be16 sb_logsectsize; /* sector size for the log, bytes */
+ __be32 sb_logsunit; /* stripe unit size for the log */
+ __be32 sb_features2; /* additional feature bits */
+ /*
+ * bad features2 field as a result of failing to pad the sb
+ * structure to 64 bits. Some machines will be using this field
+ * for features2 bits. Easiest just to mark it bad and not use
+ * it for anything else.
+ */
+ __be32 sb_bad_features2;
+
+ /* must be padded to 64 bit alignment */
+} xfs_dsb_t;
+
+/*
+ * Sequence number values for the fields.
+ */
+typedef enum {
+ XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
+ XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
+ XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
+ XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
+ XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
+ XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
+ XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
+ XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
+ XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
+ XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
+ XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
+ XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
+ XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2,
+ XFS_SBS_FIELDCOUNT
+} xfs_sb_field_t;
+
+/*
+ * Mask values, defined based on the xfs_sb_field_t values.
+ * Only define the ones we're using.
+ */
+#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
+#define XFS_SB_UUID XFS_SB_MVAL(UUID)
+#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
+#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
+#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
+#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
+#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
+#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
+#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
+#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
+#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
+#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
+#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
+#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
+#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
+#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
+#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2)
+#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
+#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
+#define XFS_SB_MOD_BITS \
+ (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
+ XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
+ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
+ XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
+ XFS_SB_BAD_FEATURES2)
+
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
+#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN 0
+
+#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+
+static inline int xfs_sb_good_version(xfs_sb_t *sbp)
+{
+ /* We always support version 1-3 */
+ if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
+ sbp->sb_versionnum <= XFS_SB_VERSION_3)
+ return 1;
+
+ /* We support version 4 if all feature bits are supported */
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
+ return 0;
+
+#ifdef __KERNEL__
+ if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
+ return 0;
+#else
+ if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
+ sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
+ return 0;
+#endif
+
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Detect a mismatched features2 field. Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
+{
+ return (sbp->sb_bad_features2 != sbp->sb_features2);
+}
+
+static inline unsigned xfs_sb_version_tonew(unsigned v)
+{
+ if (v == XFS_SB_VERSION_1)
+ return XFS_SB_VERSION_4;
+
+ if (v == XFS_SB_VERSION_2)
+ return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+
+ return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
+ XFS_SB_VERSION_NLINKBIT;
+}
+
+static inline unsigned xfs_sb_version_toold(unsigned v)
+{
+ if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
+ return 0;
+ if (v & XFS_SB_VERSION_NLINKBIT)
+ return XFS_SB_VERSION_3;
+ if (v & XFS_SB_VERSION_ATTRBIT)
+ return XFS_SB_VERSION_2;
+ return XFS_SB_VERSION_1;
+}
+
+static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
+{
+ return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
+ sbp->sb_versionnum == XFS_SB_VERSION_3 ||
+ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
+}
+
+static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
+{
+ if (sbp->sb_versionnum == XFS_SB_VERSION_1)
+ sbp->sb_versionnum = XFS_SB_VERSION_2;
+ else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+ else
+ sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+}
+
+static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
+{
+ return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
+ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
+}
+
+static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
+{
+ if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
+ sbp->sb_versionnum = XFS_SB_VERSION_3;
+ else
+ sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+}
+
+static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+}
+
+static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
+{
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+ else
+ sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
+ XFS_SB_VERSION_QUOTABIT;
+}
+
+static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
+}
+
+static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
+}
+
+static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
+}
+
+static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+}
+
+static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+}
+
+static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+}
+
+static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+}
+
+static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+}
+
+static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+}
+
+/*
+ * sb_features2 bit version macros.
+ *
+ * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro:
+ *
+ * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp)
+ * ((xfs_sb_version_hasmorebits(sbp) &&
+ * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT)
+ */
+
+static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
+{
+ return xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
+}
+
+static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
+{
+ return xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
+}
+
+static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
+{
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+}
+
+static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
+{
+ sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ if (!sbp->sb_features2)
+ sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
+}
+
+static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
+{
+ return xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
+}
+
+/*
+ * end of superblock version macros
+ */
+
+#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
+#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp))
+
+#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
+#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
+ xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
+#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
+ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
+
+/*
+ * File system sector to basic block conversions.
+ */
+#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
+
+/*
+ * File system block to basic block conversions.
+ */
+#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSB(mp,bb) \
+ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
+#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b) \
+ ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
+
+#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
new file mode 100644
index 00000000..efc147f0
--- /dev/null
+++ b/fs/xfs/xfs_trans.c
@@ -0,0 +1,2026 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_trans_priv.h"
+#include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+
+kmem_zone_t *xfs_trans_zone;
+kmem_zone_t *xfs_log_item_desc_zone;
+
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate. Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note: Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual. In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time. This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction. See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents. This gives:
+ * the inode getting the new extents: inode size
+ * the inode's bmap btree: max depth * block size
+ * the agfs of the ags from which the extents are allocated: 2 * sector
+ * the superblock free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ * the agfs of the ags containing the blocks: 2 * sector size
+ * the agfls of the ags containing the blocks: 2 * sector size
+ * the super block free block counter: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once. We can modify:
+ * the inode being truncated: inode size
+ * the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
+ 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+ (4 * mp->m_sb.sb_sectsize +
+ 4 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 4) +
+ 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
+ 128 * 5 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+}
+
+/*
+ * In renaming a files we can modify:
+ * the four inodes involved: 4 * inode size
+ * the two directory btrees: 2 * (max depth + v2) * dir block size
+ * the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ * of bmap blocks) giving:
+ * the agf for the ags in which the blocks live: 3 * sector size
+ * the agfl for the ags in which the blocks live: 3 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((4 * mp->m_sb.sb_inodesize +
+ 2 * XFS_DIROP_LOG_RES(mp) +
+ 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
+ (3 * mp->m_sb.sb_sectsize +
+ 3 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 3) +
+ 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
+}
+
+/*
+ * For creating a link to an inode:
+ * the parent directory inode: inode size
+ * the linked inode: inode size
+ * the directory btree could split: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ * the agf for the ag in which the blocks live: sector size
+ * the agfl for the ag in which the blocks live: sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+ (mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+}
+
+/*
+ * For removing a directory entry we can modify:
+ * the parent directory inode: inode size
+ * the removed inode: inode size
+ * the directory btree could join: (max depth + v2) * dir block size
+ * the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+}
+
+/*
+ * For symlink we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: 1 block
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * the blocks for the symlink: 1 kB
+ * Or in the first xact we allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, 1) +
+ XFS_DIROP_LOG_RES(mp) +
+ 1024 +
+ 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
+ (2 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+ XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+}
+
+/*
+ * For create we can modify:
+ * the parent directory inode: inode size
+ * the new inode: inode size
+ * the inode btree entry: block size
+ * the superblock for the nlink flag: sector size
+ * the directory btree: (max depth + v2) * dir block size
+ * the directory inode's bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ * the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ * the superblock for the nlink flag: sector size
+ * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, 1) +
+ XFS_DIROP_LOG_RES(mp) +
+ 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
+ (3 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
+ XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_create_reservation(mp);
+}
+
+/*
+ * In freeing an inode we can modify:
+ * the inode being freed: inode size
+ * the super block free inode counter: sector size
+ * the agi hash list and counters: sector size
+ * the inode btree entry: block size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ * the inode btree: max depth * blocksize
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, 1) +
+ MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+ XFS_INODE_CLUSTER_SIZE(mp)) +
+ 128 * 5 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ 512;
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ * superblock
+ * agi and agf
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_sectsize * 3 +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ * superblock: sector size
+ * agf of the ag from which the extent is allocated: sector size
+ * bmap btree for bitmap/summary inode: max depth * blocksize
+ * bitmap/summary inode: inode size
+ * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+ struct xfs_mount *mp)
+{
+ return 2 * mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
+ mp->m_sb.sb_inodesize +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ * one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_blocksize + 128;
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ * superblock: sector size
+ * bitmap inode: inode size
+ * summary inode: inode size
+ * one bitmap block: blocksize
+ * summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_blocksize +
+ mp->m_rsumsize +
+ 128 * 5;
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ * inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_inodesize + 128;
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ * inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(xfs_mount_t *mp)
+{
+ return mp->m_sb.sb_inodesize + 128;
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ * the inode being converted: inode size
+ * agf block and superblock (for block allocation)
+ * the new block (directory sized)
+ * bmap blocks for the new directory block
+ * allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize * 2 +
+ mp->m_dirblksize +
+ XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
+ XFS_ALLOCFREE_LOG_RES(mp, 1) +
+ 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
+ XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ * the inode being truncated: inode size
+ * the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * the agf for each of the ags: 4 * sector size
+ * the agfl for each of the ags: 4 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming 4 extents:
+ * 4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+ struct xfs_mount *mp)
+{
+ return MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
+ (4 * mp->m_sb.sb_sectsize +
+ 4 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 4) +
+ 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
+}
+
+/*
+ * Setting an attribute.
+ * the inode getting the attribute
+ * the superblock for allocations
+ * the agfs extents are allocated from
+ * the attribute btree * max depth
+ * the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
+STATIC uint
+xfs_calc_attrset_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ mp->m_sb.sb_inodesize +
+ mp->m_sb.sb_sectsize +
+ XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+ 128 * (2 + XFS_DA_NODE_MAXDEPTH);
+}
+
+/*
+ * Removing an attribute.
+ * the inode: inode size
+ * the attribute btree could join: max depth * block size
+ * the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ * the agf for the ag in which the blocks live: 2 * sector size
+ * the agfl for the ag in which the blocks live: 2 * sector size
+ * the superblock for the free block count: sector size
+ * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+ struct xfs_mount *mp)
+{
+ return XFS_DQUOT_LOGRES(mp) +
+ MAX((mp->m_sb.sb_inodesize +
+ XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
+ XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+ 128 * (1 + XFS_DA_NODE_MAXDEPTH +
+ XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
+ (2 * mp->m_sb.sb_sectsize +
+ 2 * mp->m_sb.sb_sectsize +
+ mp->m_sb.sb_sectsize +
+ XFS_ALLOCFREE_LOG_RES(mp, 2) +
+ 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_sectsize + 128;
+}
+
+/*
+ * Initialize the precomputed transaction reservation values
+ * in the mount structure.
+ */
+void
+xfs_trans_init(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans_reservations *resp = &mp->m_reservations;
+
+ resp->tr_write = xfs_calc_write_reservation(mp);
+ resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
+ resp->tr_rename = xfs_calc_rename_reservation(mp);
+ resp->tr_link = xfs_calc_link_reservation(mp);
+ resp->tr_remove = xfs_calc_remove_reservation(mp);
+ resp->tr_symlink = xfs_calc_symlink_reservation(mp);
+ resp->tr_create = xfs_calc_create_reservation(mp);
+ resp->tr_mkdir = xfs_calc_mkdir_reservation(mp);
+ resp->tr_ifree = xfs_calc_ifree_reservation(mp);
+ resp->tr_ichange = xfs_calc_ichange_reservation(mp);
+ resp->tr_growdata = xfs_calc_growdata_reservation(mp);
+ resp->tr_swrite = xfs_calc_swrite_reservation(mp);
+ resp->tr_writeid = xfs_calc_writeid_reservation(mp);
+ resp->tr_addafork = xfs_calc_addafork_reservation(mp);
+ resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
+ resp->tr_attrset = xfs_calc_attrset_reservation(mp);
+ resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
+ resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
+ resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
+ resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
+ resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
+}
+
+/*
+ * This routine is called to allocate a transaction structure.
+ * The type parameter indicates the type of the transaction. These
+ * are enumerated in xfs_trans.h.
+ *
+ * Dynamically allocate the transaction structure from the transaction
+ * zone, initialize it, and return it to the caller.
+ */
+xfs_trans_t *
+xfs_trans_alloc(
+ xfs_mount_t *mp,
+ uint type)
+{
+ xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+ return _xfs_trans_alloc(mp, type, KM_SLEEP);
+}
+
+xfs_trans_t *
+_xfs_trans_alloc(
+ xfs_mount_t *mp,
+ uint type,
+ uint memflags)
+{
+ xfs_trans_t *tp;
+
+ atomic_inc(&mp->m_active_trans);
+
+ tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
+ tp->t_magic = XFS_TRANS_MAGIC;
+ tp->t_type = type;
+ tp->t_mountp = mp;
+ INIT_LIST_HEAD(&tp->t_items);
+ INIT_LIST_HEAD(&tp->t_busy);
+ return tp;
+}
+
+/*
+ * Free the transaction structure. If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+ struct xfs_trans *tp)
+{
+ xfs_alloc_busy_sort(&tp->t_busy);
+ xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
+
+ atomic_dec(&tp->t_mountp->m_active_trans);
+ xfs_trans_free_dqinfo(tp);
+ kmem_zone_free(xfs_trans_zone, tp);
+}
+
+/*
+ * This is called to create a new transaction which will share the
+ * permanent log reservation of the given transaction. The remaining
+ * unused block and rt extent reservations are also inherited. This
+ * implies that the original transaction is no longer allowed to allocate
+ * blocks. Locks and log items, however, are no inherited. They must
+ * be added to the new transaction explicitly.
+ */
+xfs_trans_t *
+xfs_trans_dup(
+ xfs_trans_t *tp)
+{
+ xfs_trans_t *ntp;
+
+ ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+
+ /*
+ * Initialize the new transaction structure.
+ */
+ ntp->t_magic = XFS_TRANS_MAGIC;
+ ntp->t_type = tp->t_type;
+ ntp->t_mountp = tp->t_mountp;
+ INIT_LIST_HEAD(&ntp->t_items);
+ INIT_LIST_HEAD(&ntp->t_busy);
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(tp->t_ticket != NULL);
+
+ ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
+ ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
+ ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
+ tp->t_blk_res = tp->t_blk_res_used;
+ ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
+ tp->t_rtx_res = tp->t_rtx_res_used;
+ ntp->t_pflags = tp->t_pflags;
+
+ xfs_trans_dup_dqinfo(tp, ntp);
+
+ atomic_inc(&tp->t_mountp->m_active_trans);
+ return ntp;
+}
+
+/*
+ * This is called to reserve free disk blocks and log space for the
+ * given transaction. This must be done before allocating any resources
+ * within the transaction.
+ *
+ * This will return ENOSPC if there are not enough blocks available.
+ * It will sleep waiting for available log space.
+ * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which
+ * is used by long running transactions. If any one of the reservations
+ * fails then they will all be backed out.
+ *
+ * This does not do quota reservations. That typically is done by the
+ * caller afterwards.
+ */
+int
+xfs_trans_reserve(
+ xfs_trans_t *tp,
+ uint blocks,
+ uint logspace,
+ uint rtextents,
+ uint flags,
+ uint logcount)
+{
+ int log_flags;
+ int error = 0;
+ int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+ /* Mark this thread as being in a transaction */
+ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+ /*
+ * Attempt to reserve the needed disk blocks by decrementing
+ * the number needed from the number available. This will
+ * fail if the count would go below zero.
+ */
+ if (blocks > 0) {
+ error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
+ -((int64_t)blocks), rsvd);
+ if (error != 0) {
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ return (XFS_ERROR(ENOSPC));
+ }
+ tp->t_blk_res += blocks;
+ }
+
+ /*
+ * Reserve the log space needed for this transaction.
+ */
+ if (logspace > 0) {
+ ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
+ ASSERT((tp->t_log_count == 0) ||
+ (tp->t_log_count == logcount));
+ if (flags & XFS_TRANS_PERM_LOG_RES) {
+ log_flags = XFS_LOG_PERM_RESERV;
+ tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+ } else {
+ ASSERT(tp->t_ticket == NULL);
+ ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+ log_flags = 0;
+ }
+
+ error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
+ &tp->t_ticket,
+ XFS_TRANSACTION, log_flags, tp->t_type);
+ if (error) {
+ goto undo_blocks;
+ }
+ tp->t_log_res = logspace;
+ tp->t_log_count = logcount;
+ }
+
+ /*
+ * Attempt to reserve the needed realtime extents by decrementing
+ * the number needed from the number available. This will
+ * fail if the count would go below zero.
+ */
+ if (rtextents > 0) {
+ error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
+ -((int64_t)rtextents), rsvd);
+ if (error) {
+ error = XFS_ERROR(ENOSPC);
+ goto undo_log;
+ }
+ tp->t_rtx_res += rtextents;
+ }
+
+ return 0;
+
+ /*
+ * Error cases jump to one of these labels to undo any
+ * reservations which have already been performed.
+ */
+undo_log:
+ if (logspace > 0) {
+ if (flags & XFS_TRANS_PERM_LOG_RES) {
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+ } else {
+ log_flags = 0;
+ }
+ xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+ tp->t_ticket = NULL;
+ tp->t_log_res = 0;
+ tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
+ }
+
+undo_blocks:
+ if (blocks > 0) {
+ xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
+ (int64_t)blocks, rsvd);
+ tp->t_blk_res = 0;
+ }
+
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+ return error;
+}
+
+/*
+ * Record the indicated change to the given field for application
+ * to the file system's superblock when the transaction commits.
+ * For now, just store the change in the transaction structure.
+ *
+ * Mark the transaction structure to indicate that the superblock
+ * needs to be updated before committing.
+ *
+ * Because we may not be keeping track of allocated/free inodes and
+ * used filesystem blocks in the superblock, we do not mark the
+ * superblock dirty in this transaction if we modify these fields.
+ * We still need to update the transaction deltas so that they get
+ * applied to the incore superblock, but we don't want them to
+ * cause the superblock to get locked and logged if these are the
+ * only fields in the superblock that the transaction modifies.
+ */
+void
+xfs_trans_mod_sb(
+ xfs_trans_t *tp,
+ uint field,
+ int64_t delta)
+{
+ uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY);
+ xfs_mount_t *mp = tp->t_mountp;
+
+ switch (field) {
+ case XFS_TRANS_SB_ICOUNT:
+ tp->t_icount_delta += delta;
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_IFREE:
+ tp->t_ifree_delta += delta;
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_FDBLOCKS:
+ /*
+ * Track the number of blocks allocated in the
+ * transaction. Make sure it does not exceed the
+ * number reserved.
+ */
+ if (delta < 0) {
+ tp->t_blk_res_used += (uint)-delta;
+ ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
+ }
+ tp->t_fdblocks_delta += delta;
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_RES_FDBLOCKS:
+ /*
+ * The allocation has already been applied to the
+ * in-core superblock's counter. This should only
+ * be applied to the on-disk superblock.
+ */
+ ASSERT(delta < 0);
+ tp->t_res_fdblocks_delta += delta;
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ flags &= ~XFS_TRANS_SB_DIRTY;
+ break;
+ case XFS_TRANS_SB_FREXTENTS:
+ /*
+ * Track the number of blocks allocated in the
+ * transaction. Make sure it does not exceed the
+ * number reserved.
+ */
+ if (delta < 0) {
+ tp->t_rtx_res_used += (uint)-delta;
+ ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
+ }
+ tp->t_frextents_delta += delta;
+ break;
+ case XFS_TRANS_SB_RES_FREXTENTS:
+ /*
+ * The allocation has already been applied to the
+ * in-core superblock's counter. This should only
+ * be applied to the on-disk superblock.
+ */
+ ASSERT(delta < 0);
+ tp->t_res_frextents_delta += delta;
+ break;
+ case XFS_TRANS_SB_DBLOCKS:
+ ASSERT(delta > 0);
+ tp->t_dblocks_delta += delta;
+ break;
+ case XFS_TRANS_SB_AGCOUNT:
+ ASSERT(delta > 0);
+ tp->t_agcount_delta += delta;
+ break;
+ case XFS_TRANS_SB_IMAXPCT:
+ tp->t_imaxpct_delta += delta;
+ break;
+ case XFS_TRANS_SB_REXTSIZE:
+ tp->t_rextsize_delta += delta;
+ break;
+ case XFS_TRANS_SB_RBMBLOCKS:
+ tp->t_rbmblocks_delta += delta;
+ break;
+ case XFS_TRANS_SB_RBLOCKS:
+ tp->t_rblocks_delta += delta;
+ break;
+ case XFS_TRANS_SB_REXTENTS:
+ tp->t_rextents_delta += delta;
+ break;
+ case XFS_TRANS_SB_REXTSLOG:
+ tp->t_rextslog_delta += delta;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ tp->t_flags |= flags;
+}
+
+/*
+ * xfs_trans_apply_sb_deltas() is called from the commit code
+ * to bring the superblock buffer into the current transaction
+ * and modify it as requested by earlier calls to xfs_trans_mod_sb().
+ *
+ * For now we just look at each field allowed to change and change
+ * it if necessary.
+ */
+STATIC void
+xfs_trans_apply_sb_deltas(
+ xfs_trans_t *tp)
+{
+ xfs_dsb_t *sbp;
+ xfs_buf_t *bp;
+ int whole = 0;
+
+ bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+ sbp = XFS_BUF_TO_SBP(bp);
+
+ /*
+ * Check that superblock mods match the mods made to AGF counters.
+ */
+ ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
+ (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
+ tp->t_ag_btree_delta));
+
+ /*
+ * Only update the superblock counters if we are logging them
+ */
+ if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
+ if (tp->t_icount_delta)
+ be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
+ if (tp->t_ifree_delta)
+ be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta);
+ if (tp->t_fdblocks_delta)
+ be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta);
+ if (tp->t_res_fdblocks_delta)
+ be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
+ }
+
+ if (tp->t_frextents_delta)
+ be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
+ if (tp->t_res_frextents_delta)
+ be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+
+ if (tp->t_dblocks_delta) {
+ be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+ whole = 1;
+ }
+ if (tp->t_agcount_delta) {
+ be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta);
+ whole = 1;
+ }
+ if (tp->t_imaxpct_delta) {
+ sbp->sb_imax_pct += tp->t_imaxpct_delta;
+ whole = 1;
+ }
+ if (tp->t_rextsize_delta) {
+ be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta);
+ whole = 1;
+ }
+ if (tp->t_rbmblocks_delta) {
+ be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta);
+ whole = 1;
+ }
+ if (tp->t_rblocks_delta) {
+ be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+ whole = 1;
+ }
+ if (tp->t_rextents_delta) {
+ be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta);
+ whole = 1;
+ }
+ if (tp->t_rextslog_delta) {
+ sbp->sb_rextslog += tp->t_rextslog_delta;
+ whole = 1;
+ }
+
+ if (whole)
+ /*
+ * Log the whole thing, the fields are noncontiguous.
+ */
+ xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1);
+ else
+ /*
+ * Since all the modifiable fields are contiguous, we
+ * can get away with this.
+ */
+ xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
+ offsetof(xfs_dsb_t, sb_frextents) +
+ sizeof(sbp->sb_frextents) - 1);
+}
+
+/*
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
+ * and apply superblock counter changes to the in-core superblock. The
+ * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
+ * applied to the in-core superblock. The idea is that that has already been
+ * done.
+ *
+ * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
+ * However, we have to ensure that we only modify each superblock field only
+ * once because the application of the delta values may not be atomic. That can
+ * lead to ENOSPC races occurring if we have two separate modifcations of the
+ * free space counter to put back the entire reservation and then take away
+ * what we used.
+ *
+ * If we are not logging superblock counters, then the inode allocated/free and
+ * used block counts are not updated in the on disk superblock. In this case,
+ * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
+ * still need to update the incore superblock with the changes.
+ */
+void
+xfs_trans_unreserve_and_mod_sb(
+ xfs_trans_t *tp)
+{
+ xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
+ xfs_mod_sb_t *msbp;
+ xfs_mount_t *mp = tp->t_mountp;
+ /* REFERENCED */
+ int error;
+ int rsvd;
+ int64_t blkdelta = 0;
+ int64_t rtxdelta = 0;
+ int64_t idelta = 0;
+ int64_t ifreedelta = 0;
+
+ msbp = msb;
+ rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+ /* calculate deltas */
+ if (tp->t_blk_res > 0)
+ blkdelta = tp->t_blk_res;
+ if ((tp->t_fdblocks_delta != 0) &&
+ (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+ (tp->t_flags & XFS_TRANS_SB_DIRTY)))
+ blkdelta += tp->t_fdblocks_delta;
+
+ if (tp->t_rtx_res > 0)
+ rtxdelta = tp->t_rtx_res;
+ if ((tp->t_frextents_delta != 0) &&
+ (tp->t_flags & XFS_TRANS_SB_DIRTY))
+ rtxdelta += tp->t_frextents_delta;
+
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+ (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+ idelta = tp->t_icount_delta;
+ ifreedelta = tp->t_ifree_delta;
+ }
+
+ /* apply the per-cpu counters */
+ if (blkdelta) {
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+ blkdelta, rsvd);
+ if (error)
+ goto out;
+ }
+
+ if (idelta) {
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
+ idelta, rsvd);
+ if (error)
+ goto out_undo_fdblocks;
+ }
+
+ if (ifreedelta) {
+ error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
+ ifreedelta, rsvd);
+ if (error)
+ goto out_undo_icount;
+ }
+
+ /* apply remaining deltas */
+ if (rtxdelta != 0) {
+ msbp->msb_field = XFS_SBS_FREXTENTS;
+ msbp->msb_delta = rtxdelta;
+ msbp++;
+ }
+
+ if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+ if (tp->t_dblocks_delta != 0) {
+ msbp->msb_field = XFS_SBS_DBLOCKS;
+ msbp->msb_delta = tp->t_dblocks_delta;
+ msbp++;
+ }
+ if (tp->t_agcount_delta != 0) {
+ msbp->msb_field = XFS_SBS_AGCOUNT;
+ msbp->msb_delta = tp->t_agcount_delta;
+ msbp++;
+ }
+ if (tp->t_imaxpct_delta != 0) {
+ msbp->msb_field = XFS_SBS_IMAX_PCT;
+ msbp->msb_delta = tp->t_imaxpct_delta;
+ msbp++;
+ }
+ if (tp->t_rextsize_delta != 0) {
+ msbp->msb_field = XFS_SBS_REXTSIZE;
+ msbp->msb_delta = tp->t_rextsize_delta;
+ msbp++;
+ }
+ if (tp->t_rbmblocks_delta != 0) {
+ msbp->msb_field = XFS_SBS_RBMBLOCKS;
+ msbp->msb_delta = tp->t_rbmblocks_delta;
+ msbp++;
+ }
+ if (tp->t_rblocks_delta != 0) {
+ msbp->msb_field = XFS_SBS_RBLOCKS;
+ msbp->msb_delta = tp->t_rblocks_delta;
+ msbp++;
+ }
+ if (tp->t_rextents_delta != 0) {
+ msbp->msb_field = XFS_SBS_REXTENTS;
+ msbp->msb_delta = tp->t_rextents_delta;
+ msbp++;
+ }
+ if (tp->t_rextslog_delta != 0) {
+ msbp->msb_field = XFS_SBS_REXTSLOG;
+ msbp->msb_delta = tp->t_rextslog_delta;
+ msbp++;
+ }
+ }
+
+ /*
+ * If we need to change anything, do it.
+ */
+ if (msbp > msb) {
+ error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
+ (uint)(msbp - msb), rsvd);
+ if (error)
+ goto out_undo_ifreecount;
+ }
+
+ return;
+
+out_undo_ifreecount:
+ if (ifreedelta)
+ xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+out_undo_icount:
+ if (idelta)
+ xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+out_undo_fdblocks:
+ if (blkdelta)
+ xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+out:
+ ASSERT(error == 0);
+ return;
+}
+
+/*
+ * Add the given log item to the transaction's list of log items.
+ *
+ * The log item will now point to its new descriptor with its li_desc field.
+ */
+void
+xfs_trans_add_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_log_item_desc *lidp;
+
+ ASSERT(lip->li_mountp = tp->t_mountp);
+ ASSERT(lip->li_ailp = tp->t_mountp->m_ail);
+
+ lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
+
+ lidp->lid_item = lip;
+ lidp->lid_flags = 0;
+ lidp->lid_size = 0;
+ list_add_tail(&lidp->lid_trans, &tp->t_items);
+
+ lip->li_desc = lidp;
+}
+
+STATIC void
+xfs_trans_free_item_desc(
+ struct xfs_log_item_desc *lidp)
+{
+ list_del_init(&lidp->lid_trans);
+ kmem_zone_free(xfs_log_item_desc_zone, lidp);
+}
+
+/*
+ * Unlink and free the given descriptor.
+ */
+void
+xfs_trans_del_item(
+ struct xfs_log_item *lip)
+{
+ xfs_trans_free_item_desc(lip->li_desc);
+ lip->li_desc = NULL;
+}
+
+/*
+ * Unlock all of the items of a transaction and free all the descriptors
+ * of that transaction.
+ */
+void
+xfs_trans_free_items(
+ struct xfs_trans *tp,
+ xfs_lsn_t commit_lsn,
+ int flags)
+{
+ struct xfs_log_item_desc *lidp, *next;
+
+ list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+
+ lip->li_desc = NULL;
+
+ if (commit_lsn != NULLCOMMITLSN)
+ IOP_COMMITTING(lip, commit_lsn);
+ if (flags & XFS_TRANS_ABORT)
+ lip->li_flags |= XFS_LI_ABORTED;
+ IOP_UNLOCK(lip);
+
+ xfs_trans_free_item_desc(lidp);
+ }
+}
+
+/*
+ * Unlock the items associated with a transaction.
+ *
+ * Items which were not logged should be freed. Those which were logged must
+ * still be tracked so they can be unpinned when the transaction commits.
+ */
+STATIC void
+xfs_trans_unlock_items(
+ struct xfs_trans *tp,
+ xfs_lsn_t commit_lsn)
+{
+ struct xfs_log_item_desc *lidp, *next;
+
+ list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+ struct xfs_log_item *lip = lidp->lid_item;
+
+ lip->li_desc = NULL;
+
+ if (commit_lsn != NULLCOMMITLSN)
+ IOP_COMMITTING(lip, commit_lsn);
+ IOP_UNLOCK(lip);
+
+ /*
+ * Free the descriptor if the item is not dirty
+ * within this transaction.
+ */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ xfs_trans_free_item_desc(lidp);
+ }
+}
+
+/*
+ * Total up the number of log iovecs needed to commit this
+ * transaction. The transaction itself needs one for the
+ * transaction header. Ask each dirty item in turn how many
+ * it needs to get the total.
+ */
+static uint
+xfs_trans_count_vecs(
+ struct xfs_trans *tp)
+{
+ int nvecs;
+ struct xfs_log_item_desc *lidp;
+
+ nvecs = 1;
+
+ /* In the non-debug case we need to start bailing out if we
+ * didn't find a log_item here, return zero and let trans_commit
+ * deal with it.
+ */
+ if (list_empty(&tp->t_items)) {
+ ASSERT(0);
+ return 0;
+ }
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ /*
+ * Skip items which aren't dirty in this transaction.
+ */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+ lidp->lid_size = IOP_SIZE(lidp->lid_item);
+ nvecs += lidp->lid_size;
+ }
+
+ return nvecs;
+}
+
+/*
+ * Fill in the vector with pointers to data to be logged
+ * by this transaction. The transaction header takes
+ * the first vector, and then each dirty item takes the
+ * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ *
+ * As each item fills in the entries it needs, also pin the item
+ * so that it cannot be flushed out until the log write completes.
+ */
+static void
+xfs_trans_fill_vecs(
+ struct xfs_trans *tp,
+ struct xfs_log_iovec *log_vector)
+{
+ struct xfs_log_item_desc *lidp;
+ struct xfs_log_iovec *vecp;
+ uint nitems;
+
+ /*
+ * Skip over the entry for the transaction header, we'll
+ * fill that in at the end.
+ */
+ vecp = log_vector + 1;
+
+ nitems = 0;
+ ASSERT(!list_empty(&tp->t_items));
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+
+ /*
+ * The item may be marked dirty but not log anything. This can
+ * be used to get called when a transaction is committed.
+ */
+ if (lidp->lid_size)
+ nitems++;
+ IOP_FORMAT(lidp->lid_item, vecp);
+ vecp += lidp->lid_size;
+ IOP_PIN(lidp->lid_item);
+ }
+
+ /*
+ * Now that we've counted the number of items in this transaction, fill
+ * in the transaction header. Note that the transaction header does not
+ * have a log item.
+ */
+ tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
+ tp->t_header.th_type = tp->t_type;
+ tp->t_header.th_num_items = nitems;
+ log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
+ log_vector->i_len = sizeof(xfs_trans_header_t);
+ log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
+}
+
+/*
+ * The committed item processing consists of calling the committed routine of
+ * each logged item, updating the item's position in the AIL if necessary, and
+ * unpinning each item. If the committed routine returns -1, then do nothing
+ * further with the item because it may have been freed.
+ *
+ * Since items are unlocked when they are copied to the incore log, it is
+ * possible for two transactions to be completing and manipulating the same
+ * item simultaneously. The AIL lock will protect the lsn field of each item.
+ * The value of this field can never go backwards.
+ *
+ * We unpin the items after repositioning them in the AIL, because otherwise
+ * they could be immediately flushed and we'd have to race with the flusher
+ * trying to pull the item from the AIL as we add it.
+ */
+static void
+xfs_trans_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn,
+ int aborted)
+{
+ xfs_lsn_t item_lsn;
+ struct xfs_ail *ailp;
+
+ if (aborted)
+ lip->li_flags |= XFS_LI_ABORTED;
+ item_lsn = IOP_COMMITTED(lip, commit_lsn);
+
+ /* item_lsn of -1 means the item needs no further processing */
+ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+ return;
+
+ /*
+ * If the returned lsn is greater than what it contained before, update
+ * the location of the item in the AIL. If it is not, then do nothing.
+ * Items can never move backwards in the AIL.
+ *
+ * While the new lsn should usually be greater, it is possible that a
+ * later transaction completing simultaneously with an earlier one
+ * using the same item could complete first with a higher lsn. This
+ * would cause the earlier transaction to fail the test below.
+ */
+ ailp = lip->li_ailp;
+ spin_lock(&ailp->xa_lock);
+ if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
+ /*
+ * This will set the item's lsn to item_lsn and update the
+ * position of the item in the AIL.
+ *
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_update(ailp, lip, item_lsn);
+ } else {
+ spin_unlock(&ailp->xa_lock);
+ }
+
+ /*
+ * Now that we've repositioned the item in the AIL, unpin it so it can
+ * be flushed. Pass information about buffer stale state down from the
+ * log item flags, if anyone else stales the buffer we do not want to
+ * pay any attention to it.
+ */
+ IOP_UNPIN(lip, 0);
+}
+
+/*
+ * This is typically called by the LM when a transaction has been fully
+ * committed to disk. It needs to unpin the items which have
+ * been logged by the transaction and update their positions
+ * in the AIL if necessary.
+ *
+ * This also gets called when the transactions didn't get written out
+ * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
+ */
+STATIC void
+xfs_trans_committed(
+ void *arg,
+ int abortflag)
+{
+ struct xfs_trans *tp = arg;
+ struct xfs_log_item_desc *lidp, *next;
+
+ list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
+ xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
+ xfs_trans_free_item_desc(lidp);
+ }
+
+ xfs_trans_free(tp);
+}
+
+static inline void
+xfs_log_item_batch_insert(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item **log_items,
+ int nr_items,
+ xfs_lsn_t commit_lsn)
+{
+ int i;
+
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
+
+ for (i = 0; i < nr_items; i++)
+ IOP_UNPIN(log_items[i], 0);
+}
+
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL. This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is IOP_COMMITTED processing, followed by an
+ * IOP_UNPIN(aborted) call.
+ *
+ * The AIL cursor is used to optimise the insert process. If commit_lsn is not
+ * at the end of the AIL, the insert cursor avoids the need to walk
+ * the AIL to find the insertion point on every xfs_log_item_batch_insert()
+ * call. This saves a lot of needless list walking and is a net win, even
+ * though it slightly increases that amount of AIL lock traffic to set it up
+ * and tear it down.
+ */
+void
+xfs_trans_committed_bulk(
+ struct xfs_ail *ailp,
+ struct xfs_log_vec *log_vector,
+ xfs_lsn_t commit_lsn,
+ int aborted)
+{
+#define LOG_ITEM_BATCH_SIZE 32
+ struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
+ struct xfs_log_vec *lv;
+ struct xfs_ail_cursor cur;
+ int i = 0;
+
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn);
+ spin_unlock(&ailp->xa_lock);
+
+ /* unpin all the log items */
+ for (lv = log_vector; lv; lv = lv->lv_next ) {
+ struct xfs_log_item *lip = lv->lv_item;
+ xfs_lsn_t item_lsn;
+
+ if (aborted)
+ lip->li_flags |= XFS_LI_ABORTED;
+ item_lsn = IOP_COMMITTED(lip, commit_lsn);
+
+ /* item_lsn of -1 means the item needs no further processing */
+ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+ continue;
+
+ /*
+ * if we are aborting the operation, no point in inserting the
+ * object into the AIL as we are in a shutdown situation.
+ */
+ if (aborted) {
+ ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+ IOP_UNPIN(lip, 1);
+ continue;
+ }
+
+ if (item_lsn != commit_lsn) {
+
+ /*
+ * Not a bulk update option due to unusual item_lsn.
+ * Push into AIL immediately, rechecking the lsn once
+ * we have the ail lock. Then unpin the item. This does
+ * not affect the AIL cursor the bulk insert path is
+ * using.
+ */
+ spin_lock(&ailp->xa_lock);
+ if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+ xfs_trans_ail_update(ailp, lip, item_lsn);
+ else
+ spin_unlock(&ailp->xa_lock);
+ IOP_UNPIN(lip, 0);
+ continue;
+ }
+
+ /* Item is a candidate for bulk AIL insert. */
+ log_items[i++] = lv->lv_item;
+ if (i >= LOG_ITEM_BATCH_SIZE) {
+ xfs_log_item_batch_insert(ailp, &cur, log_items,
+ LOG_ITEM_BATCH_SIZE, commit_lsn);
+ i = 0;
+ }
+ }
+
+ /* make sure we insert the remainder! */
+ if (i)
+ xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
+
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_cursor_done(ailp, &cur);
+ spin_unlock(&ailp->xa_lock);
+}
+
+/*
+ * Called from the trans_commit code when we notice that the filesystem is in
+ * the middle of a forced shutdown.
+ *
+ * When we are called here, we have already pinned all the items in the
+ * transaction. However, neither IOP_COMMITTING or IOP_UNLOCK has been called
+ * so we can simply walk the items in the transaction, unpin them with an abort
+ * flag and then free the items. Note that unpinning the items can result in
+ * them being freed immediately, so we need to use a safe list traversal method
+ * here.
+ */
+STATIC void
+xfs_trans_uncommit(
+ struct xfs_trans *tp,
+ uint flags)
+{
+ struct xfs_log_item_desc *lidp, *n;
+
+ list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
+ if (lidp->lid_flags & XFS_LID_DIRTY)
+ IOP_UNPIN(lidp->lid_item, 1);
+ }
+
+ xfs_trans_unreserve_and_mod_sb(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp);
+
+ xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+ xfs_trans_free(tp);
+}
+
+/*
+ * Format the transaction direct to the iclog. This isolates the physical
+ * transaction commit operation from the logical operation and hence allows
+ * other methods to be introduced without affecting the existing commit path.
+ */
+static int
+xfs_trans_commit_iclog(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_lsn_t *commit_lsn,
+ int flags)
+{
+ int shutdown;
+ int error;
+ int log_flags = 0;
+ struct xlog_in_core *commit_iclog;
+#define XFS_TRANS_LOGVEC_COUNT 16
+ struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
+ struct xfs_log_iovec *log_vector;
+ uint nvec;
+
+
+ /*
+ * Ask each log item how many log_vector entries it will
+ * need so we can figure out how many to allocate.
+ * Try to avoid the kmem_alloc() call in the common case
+ * by using a vector from the stack when it fits.
+ */
+ nvec = xfs_trans_count_vecs(tp);
+ if (nvec == 0) {
+ return ENOMEM; /* triggers a shutdown! */
+ } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
+ log_vector = log_vector_fast;
+ } else {
+ log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec *
+ sizeof(xfs_log_iovec_t),
+ KM_SLEEP);
+ }
+
+ /*
+ * Fill in the log_vector and pin the logged items, and
+ * then write the transaction to the log.
+ */
+ xfs_trans_fill_vecs(tp, log_vector);
+
+ if (flags & XFS_TRANS_RELEASE_LOG_RES)
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+
+ error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
+
+ /*
+ * The transaction is committed incore here, and can go out to disk
+ * at any time after this call. However, all the items associated
+ * with the transaction are still locked and pinned in memory.
+ */
+ *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
+
+ tp->t_commit_lsn = *commit_lsn;
+ trace_xfs_trans_commit_lsn(tp);
+
+ if (nvec > XFS_TRANS_LOGVEC_COUNT)
+ kmem_free(log_vector);
+
+ /*
+ * If we got a log write error. Unpin the logitems that we
+ * had pinned, clean up, free trans structure, and return error.
+ */
+ if (error || *commit_lsn == -1) {
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
+ return XFS_ERROR(EIO);
+ }
+
+ /*
+ * Once the transaction has committed, unused
+ * reservations need to be released and changes to
+ * the superblock need to be reflected in the in-core
+ * version. Do that now.
+ */
+ xfs_trans_unreserve_and_mod_sb(tp);
+
+ /*
+ * Tell the LM to call the transaction completion routine
+ * when the log write with LSN commit_lsn completes (e.g.
+ * when the transaction commit really hits the on-disk log).
+ * After this call we cannot reference tp, because the call
+ * can happen at any time and the call will free the transaction
+ * structure pointed to by tp. The only case where we call
+ * the completion routine (xfs_trans_committed) directly is
+ * if the log is turned off on a debug kernel or we're
+ * running in simulation mode (the log is explicitly turned
+ * off).
+ */
+ tp->t_logcb.cb_func = xfs_trans_committed;
+ tp->t_logcb.cb_arg = tp;
+
+ /*
+ * We need to pass the iclog buffer which was used for the
+ * transaction commit record into this function, and attach
+ * the callback to it. The callback must be attached before
+ * the items are unlocked to avoid racing with other threads
+ * waiting for an item to unlock.
+ */
+ shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb));
+
+ /*
+ * Mark this thread as no longer being in a transaction
+ */
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+ /*
+ * Once all the items of the transaction have been copied
+ * to the in core log and the callback is attached, the
+ * items can be unlocked.
+ *
+ * This will free descriptors pointing to items which were
+ * not logged since there is nothing more to do with them.
+ * For items which were logged, we will keep pointers to them
+ * so they can be unpinned after the transaction commits to disk.
+ * This will also stamp each modified meta-data item with
+ * the commit lsn of this transaction for dependency tracking
+ * purposes.
+ */
+ xfs_trans_unlock_items(tp, *commit_lsn);
+
+ /*
+ * If we detected a log error earlier, finish committing
+ * the transaction now (unpin log items, etc).
+ *
+ * Order is critical here, to avoid using the transaction
+ * pointer after its been freed (by xfs_trans_committed
+ * either here now, or as a callback). We cannot do this
+ * step inside xfs_log_notify as was done earlier because
+ * of this issue.
+ */
+ if (shutdown)
+ xfs_trans_committed(tp, XFS_LI_ABORTED);
+
+ /*
+ * Now that the xfs_trans_committed callback has been attached,
+ * and the items are released we can finally allow the iclog to
+ * go to disk.
+ */
+ return xfs_log_release_iclog(mp, commit_iclog);
+}
+
+/*
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
+ */
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
+ xfs_trans_t *tp)
+{
+ struct xfs_log_item_desc *lidp;
+ struct xfs_log_vec *lv = NULL;
+ struct xfs_log_vec *ret_lv = NULL;
+
+
+ /* Bail out if we didn't find a log item. */
+ if (list_empty(&tp->t_items)) {
+ ASSERT(0);
+ return NULL;
+ }
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ struct xfs_log_vec *new_lv;
+
+ /* Skip items which aren't dirty in this transaction. */
+ if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ continue;
+
+ /* Skip items that do not have any vectors for writing */
+ lidp->lid_size = IOP_SIZE(lidp->lid_item);
+ if (!lidp->lid_size)
+ continue;
+
+ new_lv = kmem_zalloc(sizeof(*new_lv) +
+ lidp->lid_size * sizeof(struct xfs_log_iovec),
+ KM_SLEEP);
+
+ /* The allocated iovec region lies beyond the log vector. */
+ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+ new_lv->lv_niovecs = lidp->lid_size;
+ new_lv->lv_item = lidp->lid_item;
+ if (!ret_lv)
+ ret_lv = new_lv;
+ else
+ lv->lv_next = new_lv;
+ lv = new_lv;
+ }
+
+ return ret_lv;
+}
+
+static int
+xfs_trans_commit_cil(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_lsn_t *commit_lsn,
+ int flags)
+{
+ struct xfs_log_vec *log_vector;
+
+ /*
+ * Get each log item to allocate a vector structure for
+ * the log item to to pass to the log write code. The
+ * CIL commit code will format the vector and save it away.
+ */
+ log_vector = xfs_trans_alloc_log_vecs(tp);
+ if (!log_vector)
+ return ENOMEM;
+
+ xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ xfs_trans_free(tp);
+ return 0;
+}
+
+/*
+ * xfs_trans_commit
+ *
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
+ */
+int
+_xfs_trans_commit(
+ struct xfs_trans *tp,
+ uint flags,
+ int *log_flushed)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_lsn_t commit_lsn = -1;
+ int error = 0;
+ int log_flags = 0;
+ int sync = tp->t_flags & XFS_TRANS_SYNC;
+
+ /*
+ * Determine whether this commit is releasing a permanent
+ * log reservation or not.
+ */
+ if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+ }
+
+ /*
+ * If there is nothing to be logged by the transaction,
+ * then unlock all of the items associated with the
+ * transaction and free the transaction structure.
+ * Also make sure to return any reserved blocks to
+ * the free pool.
+ */
+ if (!(tp->t_flags & XFS_TRANS_DIRTY))
+ goto out_unreserve;
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ goto out_unreserve;
+ }
+
+ ASSERT(tp->t_ticket != NULL);
+
+ /*
+ * If we need to update the superblock, then do it now.
+ */
+ if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+ xfs_trans_apply_sb_deltas(tp);
+ xfs_trans_apply_dquot_deltas(tp);
+
+ if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+ error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+ else
+ error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
+ if (error == ENOMEM) {
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ error = XFS_ERROR(EIO);
+ goto out_unreserve;
+ }
+
+ /*
+ * If the transaction needs to be synchronous, then force the
+ * log out now and wait for it.
+ */
+ if (sync) {
+ if (!error) {
+ error = _xfs_log_force_lsn(mp, commit_lsn,
+ XFS_LOG_SYNC, log_flushed);
+ }
+ XFS_STATS_INC(xs_trans_sync);
+ } else {
+ XFS_STATS_INC(xs_trans_async);
+ }
+
+ return error;
+
+out_unreserve:
+ xfs_trans_unreserve_and_mod_sb(tp);
+
+ /*
+ * It is indeed possible for the transaction to be not dirty but
+ * the dqinfo portion to be. All that means is that we have some
+ * (non-persistent) quota reservations that need to be unreserved.
+ */
+ xfs_trans_unreserve_and_mod_dquots(tp);
+ if (tp->t_ticket) {
+ commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ if (commit_lsn == -1 && !error)
+ error = XFS_ERROR(EIO);
+ }
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+ xfs_trans_free(tp);
+
+ XFS_STATS_INC(xs_trans_empty);
+ return error;
+}
+
+/*
+ * Unlock all of the transaction's items and free the transaction.
+ * The transaction must not have modified any of its items, because
+ * there is no way to restore them to their previous state.
+ *
+ * If the transaction has made a log reservation, make sure to release
+ * it as well.
+ */
+void
+xfs_trans_cancel(
+ xfs_trans_t *tp,
+ int flags)
+{
+ int log_flags;
+ xfs_mount_t *mp = tp->t_mountp;
+
+ /*
+ * See if the caller is being too lazy to figure out if
+ * the transaction really needs an abort.
+ */
+ if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
+ flags &= ~XFS_TRANS_ABORT;
+ /*
+ * See if the caller is relying on us to shut down the
+ * filesystem. This happens in paths where we detect
+ * corruption and decide to give up.
+ */
+ if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+ XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+#ifdef DEBUG
+ if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+ struct xfs_log_item_desc *lidp;
+
+ list_for_each_entry(lidp, &tp->t_items, lid_trans)
+ ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+ }
+#endif
+ xfs_trans_unreserve_and_mod_sb(tp);
+ xfs_trans_unreserve_and_mod_dquots(tp);
+
+ if (tp->t_ticket) {
+ if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ log_flags = XFS_LOG_REL_PERM_RESERV;
+ } else {
+ log_flags = 0;
+ }
+ xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ }
+
+ /* mark this thread as no longer being in a transaction */
+ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+ xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+ xfs_trans_free(tp);
+}
+
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to
+ * the next: permanent transactions are only flushed out when
+ * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * as possible to let chunks of it go to the log. So we commit the
+ * chunk we've been working on and get a new transaction to continue.
+ */
+int
+xfs_trans_roll(
+ struct xfs_trans **tpp,
+ struct xfs_inode *dp)
+{
+ struct xfs_trans *trans;
+ unsigned int logres, count;
+ int error;
+
+ /*
+ * Ensure that the inode is always logged.
+ */
+ trans = *tpp;
+ xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+ /*
+ * Copy the critical parameters from one trans to the next.
+ */
+ logres = trans->t_log_res;
+ count = trans->t_log_count;
+ *tpp = xfs_trans_dup(trans);
+
+ /*
+ * Commit the current transaction.
+ * If this commit failed, then it'd just unlock those items that
+ * are not marked ihold. That also means that a filesystem shutdown
+ * is in progress. The caller takes the responsibility to cancel
+ * the duplicate transaction that gets returned.
+ */
+ error = xfs_trans_commit(trans, 0);
+ if (error)
+ return (error);
+
+ trans = *tpp;
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(trans->t_ticket);
+
+
+ /*
+ * Reserve space in the log for th next transaction.
+ * This also pushes items in the "AIL", the list of logged items,
+ * out to disk if they are taking up space at the tail of the log
+ * that we want to use. This requires that either nothing be locked
+ * across this call, or that anything that is locked be logged in
+ * the prior and the next transactions.
+ */
+ error = xfs_trans_reserve(trans, 0, logres, 0,
+ XFS_TRANS_PERM_LOG_RES, count);
+ /*
+ * Ensure that the inode is in the new transaction and locked.
+ */
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(trans, dp);
+ return 0;
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
new file mode 100644
index 00000000..53597f4d
--- /dev/null
+++ b/fs/xfs/xfs_trans.h
@@ -0,0 +1,506 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_TRANS_H__
+#define __XFS_TRANS_H__
+
+struct xfs_log_item;
+
+/*
+ * This is the structure written in the log at the head of
+ * every transaction. It identifies the type and id of the
+ * transaction, and contains the number of items logged by
+ * the transaction so we know how many to expect during recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+ uint th_magic; /* magic number */
+ uint th_type; /* transaction type */
+ __int32_t th_tid; /* transaction id (unused) */
+ uint th_num_items; /* num items logged by trans */
+} xfs_trans_header_t;
+
+#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
+
+/*
+ * Log item types.
+ */
+#define XFS_LI_EFI 0x1236
+#define XFS_LI_EFD 0x1237
+#define XFS_LI_IUNLINK 0x1238
+#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
+#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
+#define XFS_LI_DQUOT 0x123d
+#define XFS_LI_QUOTAOFF 0x123e
+
+#define XFS_LI_TYPE_DESC \
+ { XFS_LI_EFI, "XFS_LI_EFI" }, \
+ { XFS_LI_EFD, "XFS_LI_EFD" }, \
+ { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
+ { XFS_LI_INODE, "XFS_LI_INODE" }, \
+ { XFS_LI_BUF, "XFS_LI_BUF" }, \
+ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
+ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
+
+/*
+ * Transaction types. Used to distinguish types of buffers.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE 1
+#define XFS_TRANS_SETATTR_SIZE 2
+#define XFS_TRANS_INACTIVE 3
+#define XFS_TRANS_CREATE 4
+#define XFS_TRANS_CREATE_TRUNC 5
+#define XFS_TRANS_TRUNCATE_FILE 6
+#define XFS_TRANS_REMOVE 7
+#define XFS_TRANS_LINK 8
+#define XFS_TRANS_RENAME 9
+#define XFS_TRANS_MKDIR 10
+#define XFS_TRANS_RMDIR 11
+#define XFS_TRANS_SYMLINK 12
+#define XFS_TRANS_SET_DMATTRS 13
+#define XFS_TRANS_GROWFS 14
+#define XFS_TRANS_STRAT_WRITE 15
+#define XFS_TRANS_DIOSTRAT 16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define XFS_TRANS_WRITEID 18
+#define XFS_TRANS_ADDAFORK 19
+#define XFS_TRANS_ATTRINVAL 20
+#define XFS_TRANS_ATRUNCATE 21
+#define XFS_TRANS_ATTR_SET 22
+#define XFS_TRANS_ATTR_RM 23
+#define XFS_TRANS_ATTR_FLAG 24
+#define XFS_TRANS_CLEAR_AGI_BUCKET 25
+#define XFS_TRANS_QM_SBCHANGE 26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1 27
+#define XFS_TRANS_DUMMY2 28
+#define XFS_TRANS_QM_QUOTAOFF 29
+#define XFS_TRANS_QM_DQALLOC 30
+#define XFS_TRANS_QM_SETQLIM 31
+#define XFS_TRANS_QM_DQCLUSTER 32
+#define XFS_TRANS_QM_QINOCREATE 33
+#define XFS_TRANS_QM_QUOTAOFF_END 34
+#define XFS_TRANS_SB_UNIT 35
+#define XFS_TRANS_FSYNC_TS 36
+#define XFS_TRANS_GROWFSRT_ALLOC 37
+#define XFS_TRANS_GROWFSRT_ZERO 38
+#define XFS_TRANS_GROWFSRT_FREE 39
+#define XFS_TRANS_SWAPEXT 40
+#define XFS_TRANS_SB_COUNT 41
+#define XFS_TRANS_CHECKPOINT 42
+#define XFS_TRANS_TYPE_MAX 42
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+ { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
+ { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
+ { XFS_TRANS_INACTIVE, "INACTIVE" }, \
+ { XFS_TRANS_CREATE, "CREATE" }, \
+ { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
+ { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
+ { XFS_TRANS_REMOVE, "REMOVE" }, \
+ { XFS_TRANS_LINK, "LINK" }, \
+ { XFS_TRANS_RENAME, "RENAME" }, \
+ { XFS_TRANS_MKDIR, "MKDIR" }, \
+ { XFS_TRANS_RMDIR, "RMDIR" }, \
+ { XFS_TRANS_SYMLINK, "SYMLINK" }, \
+ { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
+ { XFS_TRANS_GROWFS, "GROWFS" }, \
+ { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
+ { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
+ { XFS_TRANS_WRITEID, "WRITEID" }, \
+ { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
+ { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
+ { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
+ { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
+ { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
+ { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
+ { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
+ { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
+ { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
+ { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
+ { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
+ { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
+ { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
+ { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
+ { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
+ { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
+ { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
+ { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
+ { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
+ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
+ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
+ { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
+ { XFS_TRANS_DUMMY1, "DUMMY1" }, \
+ { XFS_TRANS_DUMMY2, "DUMMY2" }, \
+ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction. It points to the log item and keeps some
+ * flags to track the state of the log item. It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+ struct xfs_log_item *lid_item;
+ ushort lid_size;
+ unsigned char lid_flags;
+ struct list_head lid_trans;
+};
+
+#define XFS_LID_DIRTY 0x1
+
+#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
+/*
+ * Values for t_flags.
+ */
+#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
+#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
+#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
+#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
+
+/*
+ * Values for call flags parameter.
+ */
+#define XFS_TRANS_RELEASE_LOG_RES 0x4
+#define XFS_TRANS_ABORT 0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT 0x00000001
+#define XFS_TRANS_SB_IFREE 0x00000002
+#define XFS_TRANS_SB_FDBLOCKS 0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
+#define XFS_TRANS_SB_FREXTENTS 0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
+#define XFS_TRANS_SB_DBLOCKS 0x00000040
+#define XFS_TRANS_SB_AGCOUNT 0x00000080
+#define XFS_TRANS_SB_IMAXPCT 0x00000100
+#define XFS_TRANS_SB_REXTSIZE 0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
+#define XFS_TRANS_SB_RBLOCKS 0x00000800
+#define XFS_TRANS_SB_REXTENTS 0x00001000
+#define XFS_TRANS_SB_REXTSLOG 0x00002000
+
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
+ ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+ ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define XFS_DIROP_LOG_RES(mp) \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+ (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define XFS_DIROP_LOG_COUNT(mp) \
+ (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+ XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+
+#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
+#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
+#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
+#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
+#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
+#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
+#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
+#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
+#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
+#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
+#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
+#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
+#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
+#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
+#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
+/*
+ * Logging the inode timestamps on an fsync -- same as SWRITE
+ * as long as SWRITE logs the entire inode core
+ */
+#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
+#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
+#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
+#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
+#define XFS_ATTRSET_LOG_RES(mp, ext) \
+ ((mp)->m_reservations.tr_attrset + \
+ (ext * (mp)->m_sb.sb_sectsize) + \
+ (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
+ (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
+#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
+#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
+
+
+/*
+ * Various log count values.
+ */
+#define XFS_DEFAULT_LOG_COUNT 1
+#define XFS_DEFAULT_PERM_LOG_COUNT 2
+#define XFS_ITRUNCATE_LOG_COUNT 2
+#define XFS_INACTIVE_LOG_COUNT 2
+#define XFS_CREATE_LOG_COUNT 2
+#define XFS_MKDIR_LOG_COUNT 3
+#define XFS_SYMLINK_LOG_COUNT 3
+#define XFS_REMOVE_LOG_COUNT 2
+#define XFS_LINK_LOG_COUNT 2
+#define XFS_RENAME_LOG_COUNT 2
+#define XFS_WRITE_LOG_COUNT 2
+#define XFS_ADDAFORK_LOG_COUNT 2
+#define XFS_ATTRINVAL_LOG_COUNT 1
+#define XFS_ATTRSET_LOG_COUNT 3
+#define XFS_ATTRRM_LOG_COUNT 3
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer
+ * reference count values. This determine how hard the buffer
+ * cache tries to hold onto the buffer.
+ */
+#define XFS_AGF_REF 4
+#define XFS_AGI_REF 4
+#define XFS_AGFL_REF 3
+#define XFS_INO_BTREE_REF 3
+#define XFS_ALLOC_BTREE_REF 2
+#define XFS_BMAP_BTREE_REF 2
+#define XFS_DIR_BTREE_REF 2
+#define XFS_INO_REF 2
+#define XFS_ATTR_BTREE_REF 1
+#define XFS_DQUOT_REF 1
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot_acct;
+struct xfs_busy_extent;
+
+typedef struct xfs_log_item {
+ struct list_head li_ail; /* AIL pointers */
+ xfs_lsn_t li_lsn; /* last on-disk lsn */
+ struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
+ struct xfs_mount *li_mountp; /* ptr to fs mount */
+ struct xfs_ail *li_ailp; /* ptr to AIL */
+ uint li_type; /* item type */
+ uint li_flags; /* misc flags */
+ struct xfs_log_item *li_bio_list; /* buffer item list */
+ void (*li_cb)(struct xfs_buf *,
+ struct xfs_log_item *);
+ /* buffer item iodone */
+ /* callback func */
+ struct xfs_item_ops *li_ops; /* function list */
+
+ /* delayed logging */
+ struct list_head li_cil; /* CIL pointers */
+ struct xfs_log_vec *li_lv; /* active log vector */
+ xfs_lsn_t li_seq; /* CIL commit seq */
+} xfs_log_item_t;
+
+#define XFS_LI_IN_AIL 0x1
+#define XFS_LI_ABORTED 0x2
+
+#define XFS_LI_FLAGS \
+ { XFS_LI_IN_AIL, "IN_AIL" }, \
+ { XFS_LI_ABORTED, "ABORTED" }
+
+typedef struct xfs_item_ops {
+ uint (*iop_size)(xfs_log_item_t *);
+ void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+ void (*iop_pin)(xfs_log_item_t *);
+ void (*iop_unpin)(xfs_log_item_t *, int remove);
+ uint (*iop_trylock)(xfs_log_item_t *);
+ void (*iop_unlock)(xfs_log_item_t *);
+ xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+ void (*iop_push)(xfs_log_item_t *);
+ bool (*iop_pushbuf)(xfs_log_item_t *);
+ void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+} xfs_item_ops_t;
+
+#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
+#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
+#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
+#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
+#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
+#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
+#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
+#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
+#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
+#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
+
+/*
+ * Return values for the IOP_TRYLOCK() routines.
+ */
+#define XFS_ITEM_SUCCESS 0
+#define XFS_ITEM_PINNED 1
+#define XFS_ITEM_LOCKED 2
+#define XFS_ITEM_PUSHBUF 3
+
+/*
+ * This is the type of function which can be given to xfs_trans_callback()
+ * to be called upon the transaction's commit to disk.
+ */
+typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+ unsigned int t_magic; /* magic number */
+ xfs_log_callback_t t_logcb; /* log callback struct */
+ unsigned int t_type; /* transaction type */
+ unsigned int t_log_res; /* amt of log space resvd */
+ unsigned int t_log_count; /* count for perm log res */
+ unsigned int t_blk_res; /* # of blocks resvd */
+ unsigned int t_blk_res_used; /* # of resvd blocks used */
+ unsigned int t_rtx_res; /* # of rt extents resvd */
+ unsigned int t_rtx_res_used; /* # of resvd rt extents used */
+ struct xlog_ticket *t_ticket; /* log mgr ticket */
+ xfs_lsn_t t_lsn; /* log seq num of start of
+ * transaction. */
+ xfs_lsn_t t_commit_lsn; /* log seq num of end of
+ * transaction. */
+ struct xfs_mount *t_mountp; /* ptr to fs mount struct */
+ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
+ unsigned int t_flags; /* misc flags */
+ int64_t t_icount_delta; /* superblock icount change */
+ int64_t t_ifree_delta; /* superblock ifree change */
+ int64_t t_fdblocks_delta; /* superblock fdblocks chg */
+ int64_t t_res_fdblocks_delta; /* on-disk only chg */
+ int64_t t_frextents_delta;/* superblock freextents chg*/
+ int64_t t_res_frextents_delta; /* on-disk only chg */
+#ifdef DEBUG
+ int64_t t_ag_freeblks_delta; /* debugging counter */
+ int64_t t_ag_flist_delta; /* debugging counter */
+ int64_t t_ag_btree_delta; /* debugging counter */
+#endif
+ int64_t t_dblocks_delta;/* superblock dblocks change */
+ int64_t t_agcount_delta;/* superblock agcount change */
+ int64_t t_imaxpct_delta;/* superblock imaxpct change */
+ int64_t t_rextsize_delta;/* superblock rextsize chg */
+ int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
+ int64_t t_rblocks_delta;/* superblock rblocks change */
+ int64_t t_rextents_delta;/* superblocks rextents chg */
+ int64_t t_rextslog_delta;/* superblocks rextslog chg */
+ struct list_head t_items; /* log item descriptors */
+ xfs_trans_header_t t_header; /* header for in-log trans */
+ struct list_head t_busy; /* list of busy extents */
+ unsigned long t_pflags; /* saved process flags state */
+} xfs_trans_t;
+
+/*
+ * XFS transaction mechanism exported interfaces that are
+ * actually macros.
+ */
+#define xfs_trans_get_log_res(tp) ((tp)->t_log_res)
+#define xfs_trans_get_log_count(tp) ((tp)->t_log_count)
+#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
+#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
+
+#ifdef DEBUG
+#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d)
+#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d)
+#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d)
+#else
+#define xfs_trans_agblocks_delta(tp, d)
+#define xfs_trans_agflist_delta(tp, d)
+#define xfs_trans_agbtree_delta(tp, d)
+#endif
+
+/*
+ * XFS transaction mechanism exported interfaces.
+ */
+xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
+xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
+int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+ uint, uint);
+void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
+struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t,
+ int, uint);
+int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *,
+ struct xfs_buftarg *, xfs_daddr_t, int, uint,
+ struct xfs_buf **);
+struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+
+void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
+void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
+void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
+void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
+void xfs_efi_release(struct xfs_efi_log_item *, uint);
+void xfs_trans_log_efi_extent(xfs_trans_t *,
+ struct xfs_efi_log_item *,
+ xfs_fsblock_t,
+ xfs_extlen_t);
+struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
+ struct xfs_efi_log_item *,
+ uint);
+void xfs_trans_log_efd_extent(xfs_trans_t *,
+ struct xfs_efd_log_item *,
+ xfs_fsblock_t,
+ xfs_extlen_t);
+int _xfs_trans_commit(xfs_trans_t *,
+ uint flags,
+ int *);
+#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
+void xfs_trans_cancel(xfs_trans_t *, int);
+int xfs_trans_ail_init(struct xfs_mount *);
+void xfs_trans_ail_destroy(struct xfs_mount *);
+
+extern kmem_zone_t *xfs_trans_zone;
+extern kmem_zone_t *xfs_log_item_desc_zone;
+
+#endif /* __KERNEL__ */
+
+void xfs_trans_init(struct xfs_mount *);
+int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+
+#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
new file mode 100644
index 00000000..a4c281bf
--- /dev/null
+++ b/fs/xfs/xfs_trans_ail.c
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+
+#ifdef DEBUG
+/*
+ * Check that the list is sorted as it should be.
+ */
+STATIC void
+xfs_ail_check(
+ struct xfs_ail *ailp,
+ xfs_log_item_t *lip)
+{
+ xfs_log_item_t *prev_lip;
+
+ if (list_empty(&ailp->xa_ail))
+ return;
+
+ /*
+ * Check the next and previous entries are valid.
+ */
+ ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
+ prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+ if (&prev_lip->li_ail != &ailp->xa_ail)
+ ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
+
+ prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
+ if (&prev_lip->li_ail != &ailp->xa_ail)
+ ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
+
+
+#ifdef XFS_TRANS_DEBUG
+ /*
+ * Walk the list checking lsn ordering, and that every entry has the
+ * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
+ * when specifically debugging the transaction subsystem.
+ */
+ prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
+ list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
+ if (&prev_lip->li_ail != &ailp->xa_ail)
+ ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
+ ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
+ prev_lip = lip;
+ }
+#endif /* XFS_TRANS_DEBUG */
+}
+#else /* !DEBUG */
+#define xfs_ail_check(a,l)
+#endif /* DEBUG */
+
+/*
+ * Return a pointer to the first item in the AIL. If the AIL is empty, then
+ * return NULL.
+ */
+static xfs_log_item_t *
+xfs_ail_min(
+ struct xfs_ail *ailp)
+{
+ if (list_empty(&ailp->xa_ail))
+ return NULL;
+
+ return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
+}
+
+ /*
+ * Return a pointer to the last item in the AIL. If the AIL is empty, then
+ * return NULL.
+ */
+static xfs_log_item_t *
+xfs_ail_max(
+ struct xfs_ail *ailp)
+{
+ if (list_empty(&ailp->xa_ail))
+ return NULL;
+
+ return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
+}
+
+/*
+ * Return a pointer to the item which follows the given item in the AIL. If
+ * the given item is the last item in the list, then return NULL.
+ */
+static xfs_log_item_t *
+xfs_ail_next(
+ struct xfs_ail *ailp,
+ xfs_log_item_t *lip)
+{
+ if (lip->li_ail.next == &ailp->xa_ail)
+ return NULL;
+
+ return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
+}
+
+/*
+ * This is called by the log manager code to determine the LSN of the tail of
+ * the log. This is exactly the LSN of the first item in the AIL. If the AIL
+ * is empty, then this function returns 0.
+ *
+ * We need the AIL lock in order to get a coherent read of the lsn of the last
+ * item in the AIL.
+ */
+xfs_lsn_t
+xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t lsn = 0;
+ xfs_log_item_t *lip;
+
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_ail_min(ailp);
+ if (lip)
+ lsn = lip->li_lsn;
+ spin_unlock(&ailp->xa_lock);
+
+ return lsn;
+}
+
+/*
+ * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
+ */
+static xfs_lsn_t
+xfs_ail_max_lsn(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t lsn = 0;
+ xfs_log_item_t *lip;
+
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ lsn = lip->li_lsn;
+ spin_unlock(&ailp->xa_lock);
+
+ return lsn;
+}
+
+/*
+ * AIL traversal cursor initialisation.
+ *
+ * The cursor keeps track of where our current traversal is up
+ * to by tracking the next ƣtem in the list for us. However, for
+ * this to be safe, removing an object from the AIL needs to invalidate
+ * any cursor that points to it. hence the traversal cursor needs to
+ * be linked to the struct xfs_ail so that deletion can search all the
+ * active cursors for invalidation.
+ *
+ * We don't link the push cursor because it is embedded in the struct
+ * xfs_ail and hence easily findable.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ cur->item = NULL;
+ if (cur == &ailp->xa_cursors)
+ return;
+
+ cur->next = ailp->xa_cursors.next;
+ ailp->xa_cursors.next = cur;
+}
+
+/*
+ * Set the cursor to the next item, because when we look
+ * up the cursor the current item may have been freed.
+ */
+STATIC void
+xfs_trans_ail_cursor_set(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item *lip)
+{
+ if (lip)
+ cur->item = xfs_ail_next(ailp, lip);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor.
+ * If the cursor was invalidated (inidicated by a lip of 1),
+ * restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ struct xfs_log_item *lip = cur->item;
+
+ if ((__psint_t)lip & 1)
+ lip = xfs_ail_min(ailp);
+ xfs_trans_ail_cursor_set(ailp, cur, lip);
+ return lip;
+}
+
+/*
+ * Now that the traversal is complete, we need to remove the cursor
+ * from the list of traversing cursors. Avoid removing the embedded
+ * push cursor, but use the fact it is always present to make the
+ * list deletion simple.
+ */
+void
+xfs_trans_ail_cursor_done(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *done)
+{
+ struct xfs_ail_cursor *prev = NULL;
+ struct xfs_ail_cursor *cur;
+
+ done->item = NULL;
+ if (done == &ailp->xa_cursors)
+ return;
+ prev = &ailp->xa_cursors;
+ for (cur = prev->next; cur; prev = cur, cur = prev->next) {
+ if (cur == done) {
+ prev->next = cur->next;
+ break;
+ }
+ }
+ ASSERT(cur);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is
+ * called when an item is removed from the AIL. Any cursor pointing
+ * to this object is now invalid and the traversal needs to be
+ * terminated so it doesn't reference a freed object. We set the
+ * cursor item to a value of 1 so we can distinguish between an
+ * invalidation and the end of the list when getting the next item
+ * from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_ail_cursor *cur;
+
+ /* need to search all cursors */
+ for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
+ if (cur->item == lip)
+ cur->item = (struct xfs_log_item *)
+ ((__psint_t)cur->item | 1);
+ }
+}
+
+/*
+ * Initialise the cursor to the first item in the AIL with the given @lsn.
+ * This searches the list from lowest LSN to highest. Pass a @lsn of zero
+ * to initialise the cursor to the first item in the AIL.
+ */
+xfs_log_item_t *
+xfs_trans_ail_cursor_first(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn)
+{
+ xfs_log_item_t *lip;
+
+ xfs_trans_ail_cursor_init(ailp, cur);
+ lip = xfs_ail_min(ailp);
+ if (lsn == 0)
+ goto out;
+
+ list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
+ if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
+ goto out;
+ }
+ lip = NULL;
+out:
+ xfs_trans_ail_cursor_set(ailp, cur, lip);
+ return lip;
+}
+
+/*
+ * Initialise the cursor to the last item in the AIL with the given @lsn.
+ * This searches the list from highest LSN to lowest. If there is no item with
+ * the value of @lsn, then it sets the cursor to the last item with an LSN lower
+ * than @lsn.
+ */
+static struct xfs_log_item *
+__xfs_trans_ail_cursor_last(
+ struct xfs_ail *ailp,
+ xfs_lsn_t lsn)
+{
+ xfs_log_item_t *lip;
+
+ list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) {
+ if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
+ return lip;
+ }
+ return NULL;
+}
+
+/*
+ * Initialise the cursor to the last item in the AIL with the given @lsn.
+ * This searches the list from highest LSN to lowest.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_last(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn)
+{
+ xfs_trans_ail_cursor_init(ailp, cur);
+ cur->item = __xfs_trans_ail_cursor_last(ailp, lsn);
+ return cur->item;
+}
+
+/*
+ * splice the log item list into the AIL at the given LSN. We splice to the
+ * tail of the given LSN to maintain insert order for push traversals. The
+ * cursor is optional, allowing repeated updates to the same LSN to avoid
+ * repeated traversals.
+ */
+static void
+xfs_ail_splice(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct list_head *list,
+ xfs_lsn_t lsn)
+{
+ struct xfs_log_item *lip = cur ? cur->item : NULL;
+ struct xfs_log_item *next_lip;
+
+ /*
+ * Get a new cursor if we don't have a placeholder or the existing one
+ * has been invalidated.
+ */
+ if (!lip || (__psint_t)lip & 1) {
+ lip = __xfs_trans_ail_cursor_last(ailp, lsn);
+
+ if (!lip) {
+ /* The list is empty, so just splice and return. */
+ if (cur)
+ cur->item = NULL;
+ list_splice(list, &ailp->xa_ail);
+ return;
+ }
+ }
+
+ /*
+ * Our cursor points to the item we want to insert _after_, so we have
+ * to update the cursor to point to the end of the list we are splicing
+ * in so that it points to the correct location for the next splice.
+ * i.e. before the splice
+ *
+ * lsn -> lsn -> lsn + x -> lsn + x ...
+ * ^
+ * | cursor points here
+ *
+ * After the splice we have:
+ *
+ * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ...
+ * ^ ^
+ * | cursor points here | needs to move here
+ *
+ * So we set the cursor to the last item in the list to be spliced
+ * before we execute the splice, resulting in the cursor pointing to
+ * the correct item after the splice occurs.
+ */
+ if (cur) {
+ next_lip = list_entry(list->prev, struct xfs_log_item, li_ail);
+ cur->item = next_lip;
+ }
+ list_splice(list, &lip->li_ail);
+}
+
+/*
+ * Delete the given item from the AIL. Return a pointer to the item.
+ */
+static void
+xfs_ail_delete(
+ struct xfs_ail *ailp,
+ xfs_log_item_t *lip)
+{
+ xfs_ail_check(ailp, lip);
+ list_del(&lip->li_ail);
+ xfs_trans_ail_cursor_clear(ailp, lip);
+}
+
+static long
+xfsaild_push(
+ struct xfs_ail *ailp)
+{
+ xfs_mount_t *mp = ailp->xa_mount;
+ struct xfs_ail_cursor *cur = &ailp->xa_cursors;
+ xfs_log_item_t *lip;
+ xfs_lsn_t lsn;
+ xfs_lsn_t target;
+ long tout = 10;
+ int flush_log = 0;
+ int stuck = 0;
+ int count = 0;
+ int push_xfsbufd = 0;
+
+ spin_lock(&ailp->xa_lock);
+ target = ailp->xa_target;
+ xfs_trans_ail_cursor_init(ailp, cur);
+ lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn);
+ if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
+ /*
+ * AIL is empty or our push has reached the end.
+ */
+ xfs_trans_ail_cursor_done(ailp, cur);
+ spin_unlock(&ailp->xa_lock);
+ goto out_done;
+ }
+
+ XFS_STATS_INC(xs_push_ail);
+
+ /*
+ * While the item we are looking at is below the given threshold
+ * try to flush it out. We'd like not to stop until we've at least
+ * tried to push on everything in the AIL with an LSN less than
+ * the given threshold.
+ *
+ * However, we will stop after a certain number of pushes and wait
+ * for a reduced timeout to fire before pushing further. This
+ * prevents use from spinning when we can't do anything or there is
+ * lots of contention on the AIL lists.
+ */
+ lsn = lip->li_lsn;
+ while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
+ int lock_result;
+ /*
+ * If we can lock the item without sleeping, unlock the AIL
+ * lock and flush the item. Then re-grab the AIL lock so we
+ * can look for the next item on the AIL. List changes are
+ * handled by the AIL lookup functions internally
+ *
+ * If we can't lock the item, either its holder will flush it
+ * or it is already being flushed or it is being relogged. In
+ * any of these case it is being taken care of and we can just
+ * skip to the next item in the list.
+ */
+ lock_result = IOP_TRYLOCK(lip);
+ spin_unlock(&ailp->xa_lock);
+ switch (lock_result) {
+ case XFS_ITEM_SUCCESS:
+ XFS_STATS_INC(xs_push_ail_success);
+ IOP_PUSH(lip);
+ ailp->xa_last_pushed_lsn = lsn;
+ break;
+
+ case XFS_ITEM_PUSHBUF:
+ XFS_STATS_INC(xs_push_ail_pushbuf);
+
+ if (!IOP_PUSHBUF(lip)) {
+ stuck++;
+ flush_log = 1;
+ } else {
+ ailp->xa_last_pushed_lsn = lsn;
+ }
+ push_xfsbufd = 1;
+ break;
+
+ case XFS_ITEM_PINNED:
+ XFS_STATS_INC(xs_push_ail_pinned);
+ stuck++;
+ flush_log = 1;
+ break;
+
+ case XFS_ITEM_LOCKED:
+ XFS_STATS_INC(xs_push_ail_locked);
+ stuck++;
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ spin_lock(&ailp->xa_lock);
+ /* should we bother continuing? */
+ if (XFS_FORCED_SHUTDOWN(mp))
+ break;
+ ASSERT(mp->m_log);
+
+ count++;
+
+ /*
+ * Are there too many items we can't do anything with?
+ * If we we are skipping too many items because we can't flush
+ * them or they are already being flushed, we back off and
+ * given them time to complete whatever operation is being
+ * done. i.e. remove pressure from the AIL while we can't make
+ * progress so traversals don't slow down further inserts and
+ * removals to/from the AIL.
+ *
+ * The value of 100 is an arbitrary magic number based on
+ * observation.
+ */
+ if (stuck > 100)
+ break;
+
+ lip = xfs_trans_ail_cursor_next(ailp, cur);
+ if (lip == NULL)
+ break;
+ lsn = lip->li_lsn;
+ }
+ xfs_trans_ail_cursor_done(ailp, cur);
+ spin_unlock(&ailp->xa_lock);
+
+ if (flush_log) {
+ /*
+ * If something we need to push out was pinned, then
+ * push out the log so it will become unpinned and
+ * move forward in the AIL.
+ */
+ XFS_STATS_INC(xs_push_ail_flush);
+ xfs_log_force(mp, 0);
+ }
+
+ if (push_xfsbufd) {
+ /* we've got delayed write buffers to flush */
+ wake_up_process(mp->m_ddev_targp->bt_task);
+ }
+
+ /* assume we have more work to do in a short while */
+out_done:
+ if (!count) {
+ /* We're past our target or empty, so idle */
+ ailp->xa_last_pushed_lsn = 0;
+
+ tout = 50;
+ } else if (XFS_LSN_CMP(lsn, target) >= 0) {
+ /*
+ * We reached the target so wait a bit longer for I/O to
+ * complete and remove pushed items from the AIL before we
+ * start the next scan from the start of the AIL.
+ */
+ tout = 50;
+ ailp->xa_last_pushed_lsn = 0;
+ } else if ((stuck * 100) / count > 90) {
+ /*
+ * Either there is a lot of contention on the AIL or we
+ * are stuck due to operations in progress. "Stuck" in this
+ * case is defined as >90% of the items we tried to push
+ * were stuck.
+ *
+ * Backoff a bit more to allow some I/O to complete before
+ * continuing from where we were.
+ */
+ tout = 20;
+ }
+
+ return tout;
+}
+
+static int
+xfsaild(
+ void *data)
+{
+ struct xfs_ail *ailp = data;
+ long tout = 0; /* milliseconds */
+
+ while (!kthread_should_stop()) {
+ if (tout && tout <= 20)
+ __set_current_state(TASK_KILLABLE);
+ else
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(tout ?
+ msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
+
+ try_to_freeze();
+
+ tout = xfsaild_push(ailp);
+ }
+
+ return 0;
+}
+
+/*
+ * This routine is called to move the tail of the AIL forward. It does this by
+ * trying to flush items in the AIL whose lsns are below the given
+ * threshold_lsn.
+ *
+ * The push is run asynchronously in a workqueue, which means the caller needs
+ * to handle waiting on the async flush for space to become available.
+ * We don't want to interrupt any push that is in progress, hence we only queue
+ * work if we set the pushing bit approriately.
+ *
+ * We do this unlocked - we only need to know whether there is anything in the
+ * AIL at the time we are called. We don't need to access the contents of
+ * any of the objects, so the lock is not needed.
+ */
+void
+xfs_ail_push(
+ struct xfs_ail *ailp,
+ xfs_lsn_t threshold_lsn)
+{
+ xfs_log_item_t *lip;
+
+ lip = xfs_ail_min(ailp);
+ if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
+ XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
+ return;
+
+ /*
+ * Ensure that the new target is noticed in push code before it clears
+ * the XFS_AIL_PUSHING_BIT.
+ */
+ smp_wmb();
+ xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
+ smp_wmb();
+
+ wake_up_process(ailp->xa_task);
+}
+
+/*
+ * Push out all items in the AIL immediately
+ */
+void
+xfs_ail_push_all(
+ struct xfs_ail *ailp)
+{
+ xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
+
+ if (threshold_lsn)
+ xfs_ail_push(ailp, threshold_lsn);
+}
+
+/*
+ * This is to be called when an item is unlocked that may have
+ * been in the AIL. It will wake up the first member of the AIL
+ * wait list if this item's unlocking might allow it to progress.
+ * If the item is in the AIL, then we need to get the AIL lock
+ * while doing our checking so we don't race with someone going
+ * to sleep waiting for this event in xfs_trans_push_ail().
+ */
+void
+xfs_trans_unlocked_item(
+ struct xfs_ail *ailp,
+ xfs_log_item_t *lip)
+{
+ xfs_log_item_t *min_lip;
+
+ /*
+ * If we're forcibly shutting down, we may have
+ * unlocked log items arbitrarily. The last thing
+ * we want to do is to move the tail of the log
+ * over some potentially valid data.
+ */
+ if (!(lip->li_flags & XFS_LI_IN_AIL) ||
+ XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+ return;
+ }
+
+ /*
+ * This is the one case where we can call into xfs_ail_min()
+ * without holding the AIL lock because we only care about the
+ * case where we are at the tail of the AIL. If the object isn't
+ * at the tail, it doesn't matter what result we get back. This
+ * is slightly racy because since we were just unlocked, we could
+ * go to sleep between the call to xfs_ail_min and the call to
+ * xfs_log_move_tail, have someone else lock us, commit to us disk,
+ * move us out of the tail of the AIL, and then we wake up. However,
+ * the call to xfs_log_move_tail() doesn't do anything if there's
+ * not enough free space to wake people up so we're safe calling it.
+ */
+ min_lip = xfs_ail_min(ailp);
+
+ if (min_lip == lip)
+ xfs_log_move_tail(ailp->xa_mount, 1);
+} /* xfs_trans_unlocked_item */
+
+/*
+ * xfs_trans_ail_update - bulk AIL insertion operation.
+ *
+ * @xfs_trans_ail_update takes an array of log items that all need to be
+ * positioned at the same LSN in the AIL. If an item is not in the AIL, it will
+ * be added. Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL. If we move the first item in the AIL, update the log tail to
+ * match the new minimum LSN in the AIL.
+ *
+ * This function takes the AIL lock once to execute the update operations on
+ * all the items in the array, and as such should not be called with the AIL
+ * lock held. As a result, once we have the AIL lock, we need to check each log
+ * item LSN to confirm it needs to be moved forward in the AIL.
+ *
+ * To optimise the insert operation, we delete all the items from the AIL in
+ * the first pass, moving them into a temporary list, then splice the temporary
+ * list into the correct position in the AIL. This avoids needing to do an
+ * insert operation on every item.
+ *
+ * This function must be called with the AIL lock held. The lock is dropped
+ * before returning.
+ */
+void
+xfs_trans_ail_update_bulk(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item **log_items,
+ int nr_items,
+ xfs_lsn_t lsn) __releases(ailp->xa_lock)
+{
+ xfs_log_item_t *mlip;
+ xfs_lsn_t tail_lsn;
+ int mlip_changed = 0;
+ int i;
+ LIST_HEAD(tmp);
+
+ mlip = xfs_ail_min(ailp);
+
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+ if (lip->li_flags & XFS_LI_IN_AIL) {
+ /* check if we really need to move the item */
+ if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
+ continue;
+
+ xfs_ail_delete(ailp, lip);
+ if (mlip == lip)
+ mlip_changed = 1;
+ } else {
+ lip->li_flags |= XFS_LI_IN_AIL;
+ }
+ lip->li_lsn = lsn;
+ list_add(&lip->li_ail, &tmp);
+ }
+
+ xfs_ail_splice(ailp, cur, &tmp, lsn);
+
+ if (!mlip_changed) {
+ spin_unlock(&ailp->xa_lock);
+ return;
+ }
+
+ /*
+ * It is not safe to access mlip after the AIL lock is dropped, so we
+ * must get a copy of li_lsn before we do so. This is especially
+ * important on 32-bit platforms where accessing and updating 64-bit
+ * values like li_lsn is not atomic.
+ */
+ mlip = xfs_ail_min(ailp);
+ tail_lsn = mlip->li_lsn;
+ spin_unlock(&ailp->xa_lock);
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
+
+/*
+ * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
+ *
+ * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
+ * removed from the AIL. The caller is already holding the AIL lock, and done
+ * all the checks necessary to ensure the items passed in via @log_items are
+ * ready for deletion. This includes checking that the items are in the AIL.
+ *
+ * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
+ * flag from the item and reset the item's lsn to 0. If we remove the first
+ * item in the AIL, update the log tail to match the new minimum LSN in the
+ * AIL.
+ *
+ * This function will not drop the AIL lock until all items are removed from
+ * the AIL to minimise the amount of lock traffic on the AIL. This does not
+ * greatly increase the AIL hold time, but does significantly reduce the amount
+ * of traffic on the lock, especially during IO completion.
+ *
+ * This function must be called with the AIL lock held. The lock is dropped
+ * before returning.
+ */
+void
+xfs_trans_ail_delete_bulk(
+ struct xfs_ail *ailp,
+ struct xfs_log_item **log_items,
+ int nr_items) __releases(ailp->xa_lock)
+{
+ xfs_log_item_t *mlip;
+ xfs_lsn_t tail_lsn;
+ int mlip_changed = 0;
+ int i;
+
+ mlip = xfs_ail_min(ailp);
+
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+ if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ struct xfs_mount *mp = ailp->xa_mount;
+
+ spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ "%s: attempting to delete a log item that is not in the AIL",
+ __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ return;
+ }
+
+ xfs_ail_delete(ailp, lip);
+ lip->li_flags &= ~XFS_LI_IN_AIL;
+ lip->li_lsn = 0;
+ if (mlip == lip)
+ mlip_changed = 1;
+ }
+
+ if (!mlip_changed) {
+ spin_unlock(&ailp->xa_lock);
+ return;
+ }
+
+ /*
+ * It is not safe to access mlip after the AIL lock is dropped, so we
+ * must get a copy of li_lsn before we do so. This is especially
+ * important on 32-bit platforms where accessing and updating 64-bit
+ * values like li_lsn is not atomic. It is possible we've emptied the
+ * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
+ */
+ mlip = xfs_ail_min(ailp);
+ tail_lsn = mlip ? mlip->li_lsn : 0;
+ spin_unlock(&ailp->xa_lock);
+ xfs_log_move_tail(ailp->xa_mount, tail_lsn);
+}
+
+/*
+ * The active item list (AIL) is a doubly linked list of log
+ * items sorted by ascending lsn. The base of the list is
+ * a forw/back pointer pair embedded in the xfs mount structure.
+ * The base is initialized with both pointers pointing to the
+ * base. This case always needs to be distinguished, because
+ * the base has no lsn to look at. We almost always insert
+ * at the end of the list, so on inserts we search from the
+ * end of the list to find where the new item belongs.
+ */
+
+/*
+ * Initialize the doubly linked list to point only to itself.
+ */
+int
+xfs_trans_ail_init(
+ xfs_mount_t *mp)
+{
+ struct xfs_ail *ailp;
+
+ ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+ if (!ailp)
+ return ENOMEM;
+
+ ailp->xa_mount = mp;
+ INIT_LIST_HEAD(&ailp->xa_ail);
+ spin_lock_init(&ailp->xa_lock);
+
+ ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
+ ailp->xa_mount->m_fsname);
+ if (IS_ERR(ailp->xa_task))
+ goto out_free_ailp;
+
+ mp->m_ail = ailp;
+ return 0;
+
+out_free_ailp:
+ kmem_free(ailp);
+ return ENOMEM;
+}
+
+void
+xfs_trans_ail_destroy(
+ xfs_mount_t *mp)
+{
+ struct xfs_ail *ailp = mp->m_ail;
+
+ kthread_stop(ailp->xa_task);
+ kmem_free(ailp);
+}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
new file mode 100644
index 00000000..03b3b7f8
--- /dev/null
+++ b/fs/xfs/xfs_trans_buf.c
@@ -0,0 +1,879 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_rw.h"
+#include "xfs_trace.h"
+
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.
+ */
+STATIC struct xfs_buf *
+xfs_trans_buf_item_match(
+ struct xfs_trans *tp,
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ int len)
+{
+ struct xfs_log_item_desc *lidp;
+ struct xfs_buf_log_item *blip;
+
+ len = BBTOB(len);
+ list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+ blip = (struct xfs_buf_log_item *)lidp->lid_item;
+ if (blip->bli_item.li_type == XFS_LI_BUF &&
+ XFS_BUF_TARGET(blip->bli_buf) == target &&
+ XFS_BUF_ADDR(blip->bli_buf) == blkno &&
+ XFS_BUF_COUNT(blip->bli_buf) == len)
+ return blip->bli_buf;
+ }
+
+ return NULL;
+}
+
+/*
+ * Add the locked buffer to the transaction.
+ *
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it. Then add the buf item to the transaction.
+ */
+STATIC void
+_xfs_trans_bjoin(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ int reset_recur)
+{
+ struct xfs_buf_log_item *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+
+ /*
+ * The xfs_buf_log_item pointer is stored in b_fsprivate. If
+ * it doesn't have one yet, then allocate one and initialize it.
+ * The checks to see if one is there are in xfs_buf_item_init().
+ */
+ xfs_buf_item_init(bp, tp->t_mountp);
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+ if (reset_recur)
+ bip->bli_recur = 0;
+
+ /*
+ * Take a reference for this transaction on the buf item.
+ */
+ atomic_inc(&bip->bli_refcount);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &bip->bli_item);
+
+ /*
+ * Initialize b_fsprivate2 so we can find it with incore_match()
+ * in xfs_trans_get_buf() and friends above.
+ */
+ XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+}
+
+void
+xfs_trans_bjoin(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ _xfs_trans_bjoin(tp, bp, 0);
+ trace_xfs_trans_bjoin(bp->b_fspriv);
+}
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction. If it is already locked
+ * within the transaction, just increment its lock recursion count
+ * and return a pointer to it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * get_buf() call.
+ */
+xfs_buf_t *
+xfs_trans_get_buf(xfs_trans_t *tp,
+ xfs_buftarg_t *target_dev,
+ xfs_daddr_t blkno,
+ int len,
+ uint flags)
+{
+ xfs_buf_t *bp;
+ xfs_buf_log_item_t *bip;
+
+ if (flags == 0)
+ flags = XBF_LOCK | XBF_MAPPED;
+
+ /*
+ * Default to a normal get_buf() call if the tp is NULL.
+ */
+ if (tp == NULL)
+ return xfs_buf_get(target_dev, blkno, len,
+ flags | XBF_DONT_BLOCK);
+
+ /*
+ * If we find the buffer in the cache with this transaction
+ * pointer in its b_fsprivate2 field, then we know we already
+ * have it locked. In this case we just increment the lock
+ * recursion count and return the buffer to the caller.
+ */
+ bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
+ if (bp != NULL) {
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+ if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
+ XFS_BUF_SUPER_STALE(bp);
+
+ /*
+ * If the buffer is stale then it was binval'ed
+ * since last read. This doesn't matter since the
+ * caller isn't allowed to use the data anyway.
+ */
+ else if (XFS_BUF_ISSTALE(bp))
+ ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ bip->bli_recur++;
+ trace_xfs_trans_get_buf_recur(bip);
+ return (bp);
+ }
+
+ /*
+ * We always specify the XBF_DONT_BLOCK flag within a transaction
+ * so that get_buf does not try to push out a delayed write buffer
+ * which might cause another transaction to take place (if the
+ * buffer was delayed alloc). Such recursive transactions can
+ * easily deadlock with our current transaction as well as cause
+ * us to run out of stack space.
+ */
+ bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
+ if (bp == NULL) {
+ return NULL;
+ }
+
+ ASSERT(!XFS_BUF_GETERROR(bp));
+
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_get_buf(bp->b_fspriv);
+ return (bp);
+}
+
+/*
+ * Get and lock the superblock buffer of this file system for the
+ * given transaction.
+ *
+ * We don't need to use incore_match() here, because the superblock
+ * buffer is a private buffer which we keep a pointer to in the
+ * mount structure.
+ */
+xfs_buf_t *
+xfs_trans_getsb(xfs_trans_t *tp,
+ struct xfs_mount *mp,
+ int flags)
+{
+ xfs_buf_t *bp;
+ xfs_buf_log_item_t *bip;
+
+ /*
+ * Default to just trying to lock the superblock buffer
+ * if tp is NULL.
+ */
+ if (tp == NULL) {
+ return (xfs_getsb(mp, flags));
+ }
+
+ /*
+ * If the superblock buffer already has this transaction
+ * pointer in its b_fsprivate2 field, then we know we already
+ * have it locked. In this case we just increment the lock
+ * recursion count and return the buffer to the caller.
+ */
+ bp = mp->m_sb_bp;
+ if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) {
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+ ASSERT(bip != NULL);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ bip->bli_recur++;
+ trace_xfs_trans_getsb_recur(bip);
+ return (bp);
+ }
+
+ bp = xfs_getsb(mp, flags);
+ if (bp == NULL)
+ return NULL;
+
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_getsb(bp->b_fspriv);
+ return (bp);
+}
+
+#ifdef DEBUG
+xfs_buftarg_t *xfs_error_target;
+int xfs_do_error;
+int xfs_req_num;
+int xfs_error_mod = 33;
+#endif
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction. If it has not yet been
+ * read in, read it from disk. If it is already locked
+ * within the transaction and already read in, just increment its
+ * lock recursion count and return a pointer to it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * read_buf() call.
+ */
+int
+xfs_trans_read_buf(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_buftarg_t *target,
+ xfs_daddr_t blkno,
+ int len,
+ uint flags,
+ xfs_buf_t **bpp)
+{
+ xfs_buf_t *bp;
+ xfs_buf_log_item_t *bip;
+ int error;
+
+ if (flags == 0)
+ flags = XBF_LOCK | XBF_MAPPED;
+
+ /*
+ * Default to a normal get_buf() call if the tp is NULL.
+ */
+ if (tp == NULL) {
+ bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
+ if (!bp)
+ return (flags & XBF_TRYLOCK) ?
+ EAGAIN : XFS_ERROR(ENOMEM);
+
+ if (XFS_BUF_GETERROR(bp) != 0) {
+ xfs_ioerror_alert("xfs_trans_read_buf", mp,
+ bp, blkno);
+ error = XFS_BUF_GETERROR(bp);
+ xfs_buf_relse(bp);
+ return error;
+ }
+#ifdef DEBUG
+ if (xfs_do_error) {
+ if (xfs_error_target == target) {
+ if (((xfs_req_num++) % xfs_error_mod) == 0) {
+ xfs_buf_relse(bp);
+ xfs_debug(mp, "Returning error!");
+ return XFS_ERROR(EIO);
+ }
+ }
+ }
+#endif
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto shutdown_abort;
+ *bpp = bp;
+ return 0;
+ }
+
+ /*
+ * If we find the buffer in the cache with this transaction
+ * pointer in its b_fsprivate2 field, then we know we already
+ * have it locked. If it is already read in we just increment
+ * the lock recursion count and return the buffer to the caller.
+ * If the buffer is not yet read in, then we read it in, increment
+ * the lock recursion count, and return it to the caller.
+ */
+ bp = xfs_trans_buf_item_match(tp, target, blkno, len);
+ if (bp != NULL) {
+ ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+ ASSERT((XFS_BUF_ISERROR(bp)) == 0);
+ if (!(XFS_BUF_ISDONE(bp))) {
+ trace_xfs_trans_read_buf_io(bp, _RET_IP_);
+ ASSERT(!XFS_BUF_ISASYNC(bp));
+ XFS_BUF_READ(bp);
+ xfsbdstrat(tp->t_mountp, bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_ioerror_alert("xfs_trans_read_buf", mp,
+ bp, blkno);
+ xfs_buf_relse(bp);
+ /*
+ * We can gracefully recover from most read
+ * errors. Ones we can't are those that happen
+ * after the transaction's already dirty.
+ */
+ if (tp->t_flags & XFS_TRANS_DIRTY)
+ xfs_force_shutdown(tp->t_mountp,
+ SHUTDOWN_META_IO_ERROR);
+ return error;
+ }
+ }
+ /*
+ * We never locked this buf ourselves, so we shouldn't
+ * brelse it either. Just get out.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
+ *bpp = NULL;
+ return XFS_ERROR(EIO);
+ }
+
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+ bip->bli_recur++;
+
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ trace_xfs_trans_read_buf_recur(bip);
+ *bpp = bp;
+ return 0;
+ }
+
+ /*
+ * We always specify the XBF_DONT_BLOCK flag within a transaction
+ * so that get_buf does not try to push out a delayed write buffer
+ * which might cause another transaction to take place (if the
+ * buffer was delayed alloc). Such recursive transactions can
+ * easily deadlock with our current transaction as well as cause
+ * us to run out of stack space.
+ */
+ bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
+ if (bp == NULL) {
+ *bpp = NULL;
+ return (flags & XBF_TRYLOCK) ?
+ 0 : XFS_ERROR(ENOMEM);
+ }
+ if (XFS_BUF_GETERROR(bp) != 0) {
+ XFS_BUF_SUPER_STALE(bp);
+ error = XFS_BUF_GETERROR(bp);
+
+ xfs_ioerror_alert("xfs_trans_read_buf", mp,
+ bp, blkno);
+ if (tp->t_flags & XFS_TRANS_DIRTY)
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
+ xfs_buf_relse(bp);
+ return error;
+ }
+#ifdef DEBUG
+ if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) {
+ if (xfs_error_target == target) {
+ if (((xfs_req_num++) % xfs_error_mod) == 0) {
+ xfs_force_shutdown(tp->t_mountp,
+ SHUTDOWN_META_IO_ERROR);
+ xfs_buf_relse(bp);
+ xfs_debug(mp, "Returning trans error!");
+ return XFS_ERROR(EIO);
+ }
+ }
+ }
+#endif
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto shutdown_abort;
+
+ _xfs_trans_bjoin(tp, bp, 1);
+ trace_xfs_trans_read_buf(bp->b_fspriv);
+
+ *bpp = bp;
+ return 0;
+
+shutdown_abort:
+ /*
+ * the theory here is that buffer is good but we're
+ * bailing out because the filesystem is being forcibly
+ * shut down. So we should leave the b_flags alone since
+ * the buffer's not staled and just get out.
+ */
+#if defined(DEBUG)
+ if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
+ xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
+#endif
+ ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
+ (XBF_STALE|XBF_DELWRI));
+
+ trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
+ xfs_buf_relse(bp);
+ *bpp = NULL;
+ return XFS_ERROR(EIO);
+}
+
+
+/*
+ * Release the buffer bp which was previously acquired with one of the
+ * xfs_trans_... buffer allocation routines if the buffer has not
+ * been modified within this transaction. If the buffer is modified
+ * within this transaction, do decrement the recursion count but do
+ * not release the buffer even if the count goes to 0. If the buffer is not
+ * modified within the transaction, decrement the recursion count and
+ * release the buffer if the recursion count goes to 0.
+ *
+ * If the buffer is to be released and it was not modified before
+ * this transaction began, then free the buf_log_item associated with it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * brelse() call.
+ */
+void
+xfs_trans_brelse(xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+ xfs_log_item_t *lip;
+
+ /*
+ * Default to a normal brelse() call if the tp is NULL.
+ */
+ if (tp == NULL) {
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+ /*
+ * If there's a buf log item attached to the buffer,
+ * then let the AIL know that the buffer is being
+ * unlocked.
+ */
+ if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+ if (lip->li_type == XFS_LI_BUF) {
+ bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
+ xfs_trans_unlocked_item(bip->bli_item.li_ailp,
+ lip);
+ }
+ }
+ xfs_buf_relse(bp);
+ return;
+ }
+
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_trans_brelse(bip);
+
+ /*
+ * If the release is just for a recursive lock,
+ * then decrement the count and return.
+ */
+ if (bip->bli_recur > 0) {
+ bip->bli_recur--;
+ return;
+ }
+
+ /*
+ * If the buffer is dirty within this transaction, we can't
+ * release it until we commit.
+ */
+ if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
+ return;
+
+ /*
+ * If the buffer has been invalidated, then we can't release
+ * it until the transaction commits to disk unless it is re-dirtied
+ * as part of this transaction. This prevents us from pulling
+ * the item from the AIL before we should.
+ */
+ if (bip->bli_flags & XFS_BLI_STALE)
+ return;
+
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+
+ /*
+ * Free up the log item descriptor tracking the released item.
+ */
+ xfs_trans_del_item(&bip->bli_item);
+
+ /*
+ * Clear the hold flag in the buf log item if it is set.
+ * We wouldn't want the next user of the buffer to
+ * get confused.
+ */
+ if (bip->bli_flags & XFS_BLI_HOLD) {
+ bip->bli_flags &= ~XFS_BLI_HOLD;
+ }
+
+ /*
+ * Drop our reference to the buf log item.
+ */
+ atomic_dec(&bip->bli_refcount);
+
+ /*
+ * If the buf item is not tracking data in the log, then
+ * we must free it before releasing the buffer back to the
+ * free pool. Before releasing the buffer to the free pool,
+ * clear the transaction pointer in b_fsprivate2 to dissolve
+ * its relation to this transaction.
+ */
+ if (!xfs_buf_item_dirty(bip)) {
+/***
+ ASSERT(bp->b_pincount == 0);
+***/
+ ASSERT(atomic_read(&bip->bli_refcount) == 0);
+ ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
+ xfs_buf_item_relse(bp);
+ bip = NULL;
+ }
+ XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+
+ /*
+ * If we've still got a buf log item on the buffer, then
+ * tell the AIL that the buffer is being unlocked.
+ */
+ if (bip != NULL) {
+ xfs_trans_unlocked_item(bip->bli_item.li_ailp,
+ (xfs_log_item_t*)bip);
+ }
+
+ xfs_buf_relse(bp);
+ return;
+}
+
+/*
+ * Mark the buffer as not needing to be unlocked when the buf item's
+ * IOP_UNLOCK() routine is called. The buffer must already be locked
+ * and associated with the given transaction.
+ */
+/* ARGSUSED */
+void
+xfs_trans_bhold(xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ bip->bli_flags |= XFS_BLI_HOLD;
+ trace_xfs_trans_bhold(bip);
+}
+
+/*
+ * Cancel the previous buffer hold request made on this buffer
+ * for this transaction.
+ */
+void
+xfs_trans_bhold_release(xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ ASSERT(bip->bli_flags & XFS_BLI_HOLD);
+ bip->bli_flags &= ~XFS_BLI_HOLD;
+
+ trace_xfs_trans_bhold_release(bip);
+}
+
+/*
+ * This is called to mark bytes first through last inclusive of the given
+ * buffer as needing to be logged when the transaction is committed.
+ * The buffer must already be associated with the given transaction.
+ *
+ * First and last are numbers relative to the beginning of this buffer,
+ * so the first byte in the buffer is numbered 0 regardless of the
+ * value of b_blkno.
+ */
+void
+xfs_trans_log_buf(xfs_trans_t *tp,
+ xfs_buf_t *bp,
+ uint first,
+ uint last)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+ ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
+ ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) ||
+ (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks));
+
+ /*
+ * Mark the buffer as needing to be written out eventually,
+ * and set its iodone function to remove the buffer's buf log
+ * item from the AIL and free it when the buffer is flushed
+ * to disk. See xfs_buf_attach_iodone() for more details
+ * on li_cb and xfs_buf_iodone_callbacks().
+ * If we end up aborting this transaction, we trap this buffer
+ * inside the b_bdstrat callback so that this won't get written to
+ * disk.
+ */
+ XFS_BUF_DELAYWRITE(bp);
+ XFS_BUF_DONE(bp);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+ bip->bli_item.li_cb = xfs_buf_iodone;
+
+ trace_xfs_trans_log_buf(bip);
+
+ /*
+ * If we invalidated the buffer within this transaction, then
+ * cancel the invalidation now that we're dirtying the buffer
+ * again. There are no races with the code in xfs_buf_item_unpin(),
+ * because we have a reference to the buffer this entire time.
+ */
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ bip->bli_flags &= ~XFS_BLI_STALE;
+ ASSERT(XFS_BUF_ISSTALE(bp));
+ XFS_BUF_UNSTALE(bp);
+ bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
+ }
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ bip->bli_flags |= XFS_BLI_LOGGED;
+ xfs_buf_item_log(bip, first, last);
+}
+
+
+/*
+ * This called to invalidate a buffer that is being used within
+ * a transaction. Typically this is because the blocks in the
+ * buffer are being freed, so we need to prevent it from being
+ * written out when we're done. Allowing it to be written again
+ * might overwrite data in the free blocks if they are reallocated
+ * to a file.
+ *
+ * We prevent the buffer from being written out by clearing the
+ * B_DELWRI flag. We can't always
+ * get rid of the buf log item at this point, though, because
+ * the buffer may still be pinned by another transaction. If that
+ * is the case, then we'll wait until the buffer is committed to
+ * disk for the last time (we can tell by the ref count) and
+ * free it in xfs_buf_item_unpin(). Until it is cleaned up we
+ * will keep the buffer locked so that the buffer and buf log item
+ * are not reused.
+ */
+void
+xfs_trans_binval(
+ xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ trace_xfs_trans_binval(bip);
+
+ if (bip->bli_flags & XFS_BLI_STALE) {
+ /*
+ * If the buffer is already invalidated, then
+ * just return.
+ */
+ ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+ ASSERT(XFS_BUF_ISSTALE(bp));
+ ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
+ ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
+ ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
+ return;
+ }
+
+ /*
+ * Clear the dirty bit in the buffer and set the STALE flag
+ * in the buf log item. The STALE flag will be used in
+ * xfs_buf_item_unpin() to determine if it should clean up
+ * when the last reference to the buf item is given up.
+ * We set the XFS_BLF_CANCEL flag in the buf log format structure
+ * and log the buf item. This will be used at recovery time
+ * to determine that copies of the buffer in the log before
+ * this should not be replayed.
+ * We mark the item descriptor and the transaction dirty so
+ * that we'll hold the buffer until after the commit.
+ *
+ * Since we're invalidating the buffer, we also clear the state
+ * about which parts of the buffer have been logged. We also
+ * clear the flag indicating that this is an inode buffer since
+ * the data in the buffer will no longer be valid.
+ *
+ * We set the stale bit in the buffer as well since we're getting
+ * rid of it.
+ */
+ XFS_BUF_UNDELAYWRITE(bp);
+ XFS_BUF_STALE(bp);
+ bip->bli_flags |= XFS_BLI_STALE;
+ bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+ bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+ bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
+ memset((char *)(bip->bli_format.blf_data_map), 0,
+ (bip->bli_format.blf_map_size * sizeof(uint)));
+ bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY;
+}
+
+/*
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery. They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered. The rest of the data in the buffer is logged via the inodes
+ * themselves.
+ *
+ * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
+ */
+void
+xfs_trans_inode_buf(
+ xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_INODE_BUF;
+}
+
+/*
+ * This call is used to indicate that the buffer is going to
+ * be staled and was an inode buffer. This means it gets
+ * special processing during unpin - where any inodes
+ * associated with the buffer should be removed from ail.
+ * There is also special processing during recovery,
+ * any replay of the inodes in the buffer needs to be
+ * prevented as the buffer may have been reused.
+ */
+void
+xfs_trans_stale_inode_buf(
+ xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_STALE_INODE;
+ bip->bli_item.li_cb = xfs_buf_iodone;
+}
+
+/*
+ * Mark the buffer as being one which contains newly allocated
+ * inodes. We need to make sure that even if this buffer is
+ * relogged as an 'inode buf' we still recover all of the inode
+ * images in the face of a crash. This works in coordination with
+ * xfs_buf_item_committed() to ensure that the buffer remains in the
+ * AIL at its original location even after it has been relogged.
+ */
+/* ARGSUSED */
+void
+xfs_trans_inode_alloc_buf(
+ xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+}
+
+
+/*
+ * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
+ * dquots. However, unlike in inode buffer recovery, dquot buffers get
+ * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag).
+ * The only thing that makes dquot buffers different from regular
+ * buffers is that we must not replay dquot bufs when recovering
+ * if a _corresponding_ quotaoff has happened. We also have to distinguish
+ * between usr dquot bufs and grp dquot bufs, because usr and grp quotas
+ * can be turned off independently.
+ */
+/* ARGSUSED */
+void
+xfs_trans_dquot_buf(
+ xfs_trans_t *tp,
+ xfs_buf_t *bp,
+ uint type)
+{
+ xfs_buf_log_item_t *bip;
+
+ ASSERT(XFS_BUF_ISBUSY(bp));
+ ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+ ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+ ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+ type == XFS_BLF_PDQUOT_BUF ||
+ type == XFS_BLF_GDQUOT_BUF);
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+ bip->bli_format.blf_flags |= type;
+}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
new file mode 100644
index 00000000..f7590f5b
--- /dev/null
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_extfree_item.h"
+
+/*
+ * This routine is called to allocate an "extent free intention"
+ * log item that will hold nextents worth of extents. The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efi_log_item_t *
+xfs_trans_get_efi(xfs_trans_t *tp,
+ uint nextents)
+{
+ xfs_efi_log_item_t *efip;
+
+ ASSERT(tp != NULL);
+ ASSERT(nextents > 0);
+
+ efip = xfs_efi_init(tp->t_mountp, nextents);
+ ASSERT(efip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &efip->efi_item);
+ return efip;
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as needing to be freed. It should
+ * be called once for each extent to be freed.
+ */
+void
+xfs_trans_log_efi_extent(xfs_trans_t *tp,
+ xfs_efi_log_item_t *efip,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len)
+{
+ uint next_extent;
+ xfs_extent_t *extp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+ ASSERT(next_extent < efip->efi_format.efi_nextents);
+ extp = &(efip->efi_format.efi_extents[next_extent]);
+ extp->ext_start = start_block;
+ extp->ext_len = ext_len;
+}
+
+
+/*
+ * This routine is called to allocate an "extent free done"
+ * log item that will hold nextents worth of extents. The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efd_log_item_t *
+xfs_trans_get_efd(xfs_trans_t *tp,
+ xfs_efi_log_item_t *efip,
+ uint nextents)
+{
+ xfs_efd_log_item_t *efdp;
+
+ ASSERT(tp != NULL);
+ ASSERT(nextents > 0);
+
+ efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
+ ASSERT(efdp != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &efdp->efd_item);
+ return efdp;
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as having been freed. It should
+ * be called once for each extent freed.
+ */
+void
+xfs_trans_log_efd_extent(xfs_trans_t *tp,
+ xfs_efd_log_item_t *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len)
+{
+ uint next_extent;
+ xfs_extent_t *extp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = start_block;
+ extp->ext_len = ext_len;
+ efdp->efd_next_extent++;
+}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
new file mode 100644
index 00000000..048b0c68
--- /dev/null
+++ b/fs/xfs/xfs_trans_inode.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+
+#ifdef XFS_TRANS_DEBUG
+STATIC void
+xfs_trans_inode_broot_debug(
+ xfs_inode_t *ip);
+#else
+#define xfs_trans_inode_broot_debug(ip)
+#endif
+
+/*
+ * Add a locked inode to the transaction.
+ *
+ * The inode must be locked, and it cannot be associated with any transaction.
+ */
+void
+xfs_trans_ijoin(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ xfs_inode_log_item_t *iip;
+
+ ASSERT(ip->i_transp == NULL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (ip->i_itemp == NULL)
+ xfs_inode_item_init(ip, ip->i_mount);
+ iip = ip->i_itemp;
+ ASSERT(iip->ili_lock_flags == 0);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &iip->ili_item);
+
+ xfs_trans_inode_broot_debug(ip);
+
+ /*
+ * Initialize i_transp so we can find it with xfs_inode_incore()
+ * in xfs_trans_iget() above.
+ */
+ ip->i_transp = tp;
+}
+
+/*
+ * Add a locked inode to the transaction.
+ *
+ *
+ * Grabs a reference to the inode which will be dropped when the transaction
+ * is committed. The inode will also be unlocked at that point. The inode
+ * must be locked, and it cannot be associated with any transaction.
+ */
+void
+xfs_trans_ijoin_ref(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint lock_flags)
+{
+ xfs_trans_ijoin(tp, ip);
+ IHOLD(ip);
+ ip->i_itemp->ili_lock_flags = lock_flags;
+}
+
+/*
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
+ */
+void
+xfs_trans_ichgtime(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int flags)
+{
+ struct inode *inode = VFS_I(ip);
+ timespec_t tv;
+
+ ASSERT(tp);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(ip->i_transp == tp);
+
+ tv = current_fs_time(inode->i_sb);
+
+ if ((flags & XFS_ICHGTIME_MOD) &&
+ !timespec_equal(&inode->i_mtime, &tv)) {
+ inode->i_mtime = tv;
+ }
+ if ((flags & XFS_ICHGTIME_CHG) &&
+ !timespec_equal(&inode->i_ctime, &tv)) {
+ inode->i_ctime = tv;
+ }
+}
+
+/*
+ * This is called to mark the fields indicated in fieldmask as needing
+ * to be logged when the transaction is committed. The inode must
+ * already be associated with the given transaction.
+ *
+ * The values for fieldmask are defined in xfs_inode_item.h. We always
+ * log all of the core inode if any of it has changed, and we always log
+ * all of the inline data/extents/b-tree root if any of them has changed.
+ */
+void
+xfs_trans_log_inode(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip,
+ uint flags)
+{
+ ASSERT(ip->i_transp == tp);
+ ASSERT(ip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+ /*
+ * Always OR in the bits from the ili_last_fields field.
+ * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
+ * routines in the eventual clearing of the ilf_fields bits.
+ * See the big comment in xfs_iflush() for an explanation of
+ * this coordination mechanism.
+ */
+ flags |= ip->i_itemp->ili_last_fields;
+ ip->i_itemp->ili_format.ilf_fields |= flags;
+}
+
+#ifdef XFS_TRANS_DEBUG
+/*
+ * Keep track of the state of the inode btree root to make sure we
+ * log it properly.
+ */
+STATIC void
+xfs_trans_inode_broot_debug(
+ xfs_inode_t *ip)
+{
+ xfs_inode_log_item_t *iip;
+
+ ASSERT(ip->i_itemp != NULL);
+ iip = ip->i_itemp;
+ if (iip->ili_root_size != 0) {
+ ASSERT(iip->ili_orig_root != NULL);
+ kmem_free(iip->ili_orig_root);
+ iip->ili_root_size = 0;
+ iip->ili_orig_root = NULL;
+ }
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ ASSERT((ip->i_df.if_broot != NULL) &&
+ (ip->i_df.if_broot_bytes > 0));
+ iip->ili_root_size = ip->i_df.if_broot_bytes;
+ iip->ili_orig_root =
+ (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
+ memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot),
+ iip->ili_root_size);
+ }
+}
+#endif
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
new file mode 100644
index 00000000..fe2e3cbc
--- /dev/null
+++ b/fs/xfs/xfs_trans_priv.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_TRANS_PRIV_H__
+#define __XFS_TRANS_PRIV_H__
+
+struct xfs_log_item;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
+
+void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
+void xfs_trans_del_item(struct xfs_log_item *);
+void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+ int flags);
+void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+
+void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+ xfs_lsn_t commit_lsn, int aborted);
+/*
+ * AIL traversal cursor.
+ *
+ * Rather than using a generation number for detecting changes in the ail, use
+ * a cursor that is protected by the ail lock. The aild cursor exists in the
+ * struct xfs_ail, but other traversals can declare it on the stack and link it
+ * to the ail list.
+ *
+ * When an object is deleted from or moved int the AIL, the cursor list is
+ * searched to see if the object is a designated cursor item. If it is, it is
+ * deleted from the cursor so that the next time the cursor is used traversal
+ * will return to the start.
+ *
+ * This means a traversal colliding with a removal will cause a restart of the
+ * list scan, rather than any insertion or deletion anywhere in the list. The
+ * low bit of the item pointer is set if the cursor has been invalidated so
+ * that we can tell the difference between invalidation and reaching the end
+ * of the list to trigger traversal restarts.
+ */
+struct xfs_ail_cursor {
+ struct xfs_ail_cursor *next;
+ struct xfs_log_item *item;
+};
+
+/*
+ * Private AIL structures.
+ *
+ * Eventually we need to drive the locking in here as well.
+ */
+struct xfs_ail {
+ struct xfs_mount *xa_mount;
+ struct task_struct *xa_task;
+ struct list_head xa_ail;
+ xfs_lsn_t xa_target;
+ struct xfs_ail_cursor xa_cursors;
+ spinlock_t xa_lock;
+ xfs_lsn_t xa_last_pushed_lsn;
+};
+
+/*
+ * From xfs_trans_ail.c
+ */
+void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item **log_items, int nr_items,
+ xfs_lsn_t lsn) __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_update(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn) __releases(ailp->xa_lock)
+{
+ xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+}
+
+void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+ struct xfs_log_item **log_items, int nr_items)
+ __releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+ struct xfs_ail *ailp,
+ xfs_log_item_t *lip) __releases(ailp->xa_lock)
+{
+ xfs_trans_ail_delete_bulk(ailp, &lip, 1);
+}
+
+void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
+void xfs_ail_push_all(struct xfs_ail *);
+xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
+
+void xfs_trans_unlocked_item(struct xfs_ail *,
+ xfs_log_item_t *);
+
+struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn);
+struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn);
+struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur);
+void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur);
+
+#if BITS_PER_LONG != 64
+static inline void
+xfs_trans_ail_copy_lsn(
+ struct xfs_ail *ailp,
+ xfs_lsn_t *dst,
+ xfs_lsn_t *src)
+{
+ ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
+ spin_lock(&ailp->xa_lock);
+ *dst = *src;
+ spin_unlock(&ailp->xa_lock);
+}
+#else
+static inline void
+xfs_trans_ail_copy_lsn(
+ struct xfs_ail *ailp,
+ xfs_lsn_t *dst,
+ xfs_lsn_t *src)
+{
+ ASSERT(sizeof(xfs_lsn_t) == 8);
+ *dst = *src;
+}
+#endif
+#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
new file mode 100644
index 00000000..7d2c920d
--- /dev/null
+++ b/fs/xfs/xfs_trans_space.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+
+/*
+ * Components of space reservations.
+ */
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
+ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+ XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
+#define XFS_DAENTER_DBS(mp,w) \
+ (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
+#define XFS_DAENTER_BLOCKS(mp,w) \
+ (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define XFS_DAENTER_BMAP1B(mp,w) \
+ XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define XFS_DAENTER_BMAPS(mp,w) \
+ (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define XFS_DAENTER_SPACE_RES(mp,w) \
+ (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w)
+#define XFS_DIRENTER_MAX_SPLIT(mp,nl) 1
+#define XFS_DIRENTER_SPACE_RES(mp,nl) \
+ (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+ XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define XFS_DIRREMOVE_SPACE_RES(mp) \
+ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+#define XFS_IALLOC_SPACE_RES(mp) \
+ (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
+
+/*
+ * Space reservation values for various transactions.
+ */
+#define XFS_ADDAFORK_SPACE_RES(mp) \
+ ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+#define XFS_ATTRRM_SPACE_RES(mp) \
+ XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define XFS_ATTRSET_SPACE_RES(mp, v) \
+ (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define XFS_CREATE_SPACE_RES(mp,nl) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
+ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define XFS_GROWFS_SPACE_RES(mp) \
+ (2 * XFS_AG_MAXLEVELS(mp))
+#define XFS_GROWFSRT_SPACE_RES(mp,b) \
+ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define XFS_LINK_SPACE_RES(mp,nl) \
+ XFS_DIRENTER_SPACE_RES(mp,nl)
+#define XFS_MKDIR_SPACE_RES(mp,nl) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_QM_DQALLOC_SPACE_RES(mp) \
+ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+ XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
+ XFS_IALLOC_SPACE_RES(mp)
+#define XFS_REMOVE_SPACE_RES(mp) \
+ XFS_DIRREMOVE_SPACE_RES(mp)
+#define XFS_RENAME_SPACE_RES(mp,nl) \
+ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
+ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+
+#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
new file mode 100644
index 00000000..65584b55
--- /dev/null
+++ b/fs/xfs/xfs_types.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_TYPES_H__
+#define __XFS_TYPES_H__
+
+#ifdef __KERNEL__
+
+/*
+ * Additional type declarations for XFS
+ */
+typedef signed char __int8_t;
+typedef unsigned char __uint8_t;
+typedef signed short int __int16_t;
+typedef unsigned short int __uint16_t;
+typedef signed int __int32_t;
+typedef unsigned int __uint32_t;
+typedef signed long long int __int64_t;
+typedef unsigned long long int __uint64_t;
+
+typedef enum { B_FALSE,B_TRUE } boolean_t;
+typedef __uint32_t prid_t; /* project ID */
+typedef __uint32_t inst_t; /* an instruction */
+
+typedef __s64 xfs_off_t; /* <file offset> type */
+typedef unsigned long long xfs_ino_t; /* <inode> type */
+typedef __s64 xfs_daddr_t; /* <disk address> type */
+typedef char * xfs_caddr_t; /* <core address> type */
+typedef __u32 xfs_dev_t;
+typedef __u32 xfs_nlink_t;
+
+/* __psint_t is the same size as a pointer */
+#if (BITS_PER_LONG == 32)
+typedef __int32_t __psint_t;
+typedef __uint32_t __psunsigned_t;
+#elif (BITS_PER_LONG == 64)
+typedef __int64_t __psint_t;
+typedef __uint64_t __psunsigned_t;
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+
+#endif /* __KERNEL__ */
+
+typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
+typedef __uint32_t xfs_extlen_t; /* extent length in blocks */
+typedef __uint32_t xfs_agnumber_t; /* allocation group number */
+typedef __int32_t xfs_extnum_t; /* # of extents in a file */
+typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef __int64_t xfs_fsize_t; /* bytes in a file */
+typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
+
+typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */
+typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */
+
+typedef __int64_t xfs_lsn_t; /* log sequence number */
+typedef __int32_t xfs_tid_t; /* transaction identifier */
+
+typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
+typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
+
+/*
+ * These types are 64 bits on disk but are either 32 or 64 bits in memory.
+ * Disk based types:
+ */
+typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */
+typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */
+typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */
+typedef __uint64_t xfs_dfiloff_t; /* block number in a file */
+typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */
+
+/*
+ * Memory based types are conditional.
+ */
+#if XFS_BIG_BLKNOS
+typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
+typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
+typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+#else
+typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
+typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */
+typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+#endif
+typedef __uint64_t xfs_fileoff_t; /* block number in a file */
+typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
+typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
+
+/*
+ * Null values for the types.
+ */
+#define NULLDFSBNO ((xfs_dfsbno_t)-1)
+#define NULLDRFSBNO ((xfs_drfsbno_t)-1)
+#define NULLDRTBNO ((xfs_drtbno_t)-1)
+#define NULLDFILOFF ((xfs_dfiloff_t)-1)
+
+#define NULLFSBLOCK ((xfs_fsblock_t)-1)
+#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
+#define NULLRTBLOCK ((xfs_rtblock_t)-1)
+#define NULLFILEOFF ((xfs_fileoff_t)-1)
+
+#define NULLAGBLOCK ((xfs_agblock_t)-1)
+#define NULLAGNUMBER ((xfs_agnumber_t)-1)
+#define NULLEXTNUM ((xfs_extnum_t)-1)
+
+#define NULLCOMMITLSN ((xfs_lsn_t)-1)
+
+/*
+ * Max values for extlen, extnum, aextnum.
+ */
+#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
+#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
+#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
+
+/*
+ * Min numbers of data/attr fork btree root pointers.
+ */
+#define MINDBTPTRS 3
+#define MINABTPTRS 2
+
+/*
+ * MAXNAMELEN is the length (including the terminating null) of
+ * the longest permissible file (component) name.
+ */
+#define MAXNAMELEN 256
+
+typedef enum {
+ XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
+} xfs_lookup_t;
+
+typedef enum {
+ XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
+ XFS_BTNUM_MAX
+} xfs_btnum_t;
+
+struct xfs_name {
+ const unsigned char *name;
+ int len;
+};
+
+#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
new file mode 100644
index 00000000..8b32d1a4
--- /dev/null
+++ b/fs/xfs/xfs_utils.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_itable.h"
+#include "xfs_utils.h"
+
+
+/*
+ * Allocates a new inode from disk and return a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
+ *
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
+ *
+ */
+int
+xfs_dir_ialloc(
+ xfs_trans_t **tpp, /* input: current transaction;
+ output: may be a new transaction. */
+ xfs_inode_t *dp, /* directory within whose allocate
+ the inode. */
+ mode_t mode,
+ xfs_nlink_t nlink,
+ xfs_dev_t rdev,
+ prid_t prid, /* project id */
+ int okalloc, /* ok to allocate new space */
+ xfs_inode_t **ipp, /* pointer to inode; it will be
+ locked. */
+ int *committed)
+
+{
+ xfs_trans_t *tp;
+ xfs_trans_t *ntp;
+ xfs_inode_t *ip;
+ xfs_buf_t *ialloc_context = NULL;
+ boolean_t call_again = B_FALSE;
+ int code;
+ uint log_res;
+ uint log_count;
+ void *dqinfo;
+ uint tflags;
+
+ tp = *tpp;
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ /*
+ * xfs_ialloc will return a pointer to an incore inode if
+ * the Space Manager has an available inode on the free
+ * list. Otherwise, it will do an allocation and replenish
+ * the freelist. Since we can only do one allocation per
+ * transaction without deadlocks, we will need to commit the
+ * current transaction and start a new one. We will then
+ * need to call xfs_ialloc again to get the inode.
+ *
+ * If xfs_ialloc did an allocation to replenish the freelist,
+ * it returns the bp containing the head of the freelist as
+ * ialloc_context. We will hold a lock on it across the
+ * transaction commit so that no other process can steal
+ * the inode(s) that we've just allocated.
+ */
+ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
+ &ialloc_context, &call_again, &ip);
+
+ /*
+ * Return an error if we were unable to allocate a new inode.
+ * This should only happen if we run out of space on disk or
+ * encounter a disk error.
+ */
+ if (code) {
+ *ipp = NULL;
+ return code;
+ }
+ if (!call_again && (ip == NULL)) {
+ *ipp = NULL;
+ return XFS_ERROR(ENOSPC);
+ }
+
+ /*
+ * If call_again is set, then we were unable to get an
+ * inode in one operation. We need to commit the current
+ * transaction and call xfs_ialloc() again. It is guaranteed
+ * to succeed the second time.
+ */
+ if (call_again) {
+
+ /*
+ * Normally, xfs_trans_commit releases all the locks.
+ * We call bhold to hang on to the ialloc_context across
+ * the commit. Holding this buffer prevents any other
+ * processes from doing any allocations in this
+ * allocation group.
+ */
+ xfs_trans_bhold(tp, ialloc_context);
+ /*
+ * Save the log reservation so we can use
+ * them in the next transaction.
+ */
+ log_res = xfs_trans_get_log_res(tp);
+ log_count = xfs_trans_get_log_count(tp);
+
+ /*
+ * We want the quota changes to be associated with the next
+ * transaction, NOT this one. So, detach the dqinfo from this
+ * and attach it to the next transaction.
+ */
+ dqinfo = NULL;
+ tflags = 0;
+ if (tp->t_dqinfo) {
+ dqinfo = (void *)tp->t_dqinfo;
+ tp->t_dqinfo = NULL;
+ tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+ tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+ }
+
+ ntp = xfs_trans_dup(tp);
+ code = xfs_trans_commit(tp, 0);
+ tp = ntp;
+ if (committed != NULL) {
+ *committed = 1;
+ }
+ /*
+ * If we get an error during the commit processing,
+ * release the buffer that is still held and return
+ * to the caller.
+ */
+ if (code) {
+ xfs_buf_relse(ialloc_context);
+ if (dqinfo) {
+ tp->t_dqinfo = dqinfo;
+ xfs_trans_free_dqinfo(tp);
+ }
+ *tpp = ntp;
+ *ipp = NULL;
+ return code;
+ }
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(tp->t_ticket);
+ code = xfs_trans_reserve(tp, 0, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
+ /*
+ * Re-attach the quota info that we detached from prev trx.
+ */
+ if (dqinfo) {
+ tp->t_dqinfo = dqinfo;
+ tp->t_flags |= tflags;
+ }
+
+ if (code) {
+ xfs_buf_relse(ialloc_context);
+ *tpp = ntp;
+ *ipp = NULL;
+ return code;
+ }
+ xfs_trans_bjoin(tp, ialloc_context);
+
+ /*
+ * Call ialloc again. Since we've locked out all
+ * other allocations in this allocation group,
+ * this call should always succeed.
+ */
+ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
+ okalloc, &ialloc_context, &call_again, &ip);
+
+ /*
+ * If we get an error at this point, return to the caller
+ * so that the current transaction can be aborted.
+ */
+ if (code) {
+ *tpp = tp;
+ *ipp = NULL;
+ return code;
+ }
+ ASSERT ((!call_again) && (ip != NULL));
+
+ } else {
+ if (committed != NULL) {
+ *committed = 0;
+ }
+ }
+
+ *ipp = ip;
+ *tpp = tp;
+
+ return 0;
+}
+
+/*
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
+ */
+int /* error */
+xfs_droplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ int error;
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ ASSERT (ip->i_d.di_nlink > 0);
+ ip->i_d.di_nlink--;
+ drop_nlink(VFS_I(ip));
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = 0;
+ if (ip->i_d.di_nlink == 0) {
+ /*
+ * We're dropping the last link to this file.
+ * Move the on-disk inode to the AGI unlinked list.
+ * From xfs_inactive() we will pull the inode from
+ * the list and free it.
+ */
+ error = xfs_iunlink(tp, ip);
+ }
+ return error;
+}
+
+/*
+ * This gets called when the inode's version needs to be changed from 1 to 2.
+ * Currently this happens when the nlink field overflows the old 16-bit value
+ * or when chproj is called to change the project for the first time.
+ * As a side effect the superblock version will also get rev'd
+ * to contain the NLINK bit.
+ */
+void
+xfs_bump_ino_vers2(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(ip->i_d.di_version == 1);
+
+ ip->i_d.di_version = 2;
+ ip->i_d.di_onlink = 0;
+ memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+ mp = tp->t_mountp;
+ if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+ spin_lock(&mp->m_sb_lock);
+ if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+ xfs_sb_version_addnlink(&mp->m_sb);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+ } else {
+ spin_unlock(&mp->m_sb_lock);
+ }
+ }
+ /* Caller must log the inode */
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+ xfs_trans_t *tp,
+ xfs_inode_t *ip)
+{
+ if (ip->i_d.di_nlink >= XFS_MAXLINK)
+ return XFS_ERROR(EMLINK);
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ ASSERT(ip->i_d.di_nlink > 0);
+ ip->i_d.di_nlink++;
+ inc_nlink(VFS_I(ip));
+ if ((ip->i_d.di_version == 1) &&
+ (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
+ /*
+ * The inode has increased its number of links beyond
+ * what can fit in an old format inode. It now needs
+ * to be converted to a version 2 inode with a 32 bit
+ * link count. If this is the first inode in the file
+ * system to do this, then we need to bump the superblock
+ * version number as well.
+ */
+ xfs_bump_ino_vers2(tp, ip);
+ }
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
new file mode 100644
index 00000000..456fca31
--- /dev/null
+++ b/fs/xfs/xfs_utils.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_UTILS_H__
+#define __XFS_UTILS_H__
+
+extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
+ xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
+extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
+extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
+extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
+
+#endif /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
new file mode 100644
index 00000000..59509ae0
--- /dev/null
+++ b/fs/xfs/xfs_vnodeops.c
@@ -0,0 +1,2852 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_itable.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_rw.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_utils.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trans_space.h"
+#include "xfs_log_priv.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+
+int
+xfs_setattr(
+ struct xfs_inode *ip,
+ struct iattr *iattr,
+ int flags)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ int mask = iattr->ia_valid;
+ xfs_trans_t *tp;
+ int code;
+ uint lock_flags;
+ uint commit_flags=0;
+ uid_t uid=0, iuid=0;
+ gid_t gid=0, igid=0;
+ struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
+ int need_iolock = 1;
+
+ trace_xfs_setattr(ip);
+
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return XFS_ERROR(EROFS);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ code = -inode_change_ok(inode, iattr);
+ if (code)
+ return code;
+
+ olddquot1 = olddquot2 = NULL;
+ udqp = gdqp = NULL;
+
+ /*
+ * If disk quotas is on, we make sure that the dquots do exist on disk,
+ * before we start any other transactions. Trying to do this later
+ * is messy. We don't care to take a readlock to look at the ids
+ * in inode here, because we can't hold it across the trans_reserve.
+ * If the IDs do change before we take the ilock, we're covered
+ * because the i_*dquot fields will get updated anyway.
+ */
+ if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
+ uint qflags = 0;
+
+ if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
+ uid = iattr->ia_uid;
+ qflags |= XFS_QMOPT_UQUOTA;
+ } else {
+ uid = ip->i_d.di_uid;
+ }
+ if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
+ gid = iattr->ia_gid;
+ qflags |= XFS_QMOPT_GQUOTA;
+ } else {
+ gid = ip->i_d.di_gid;
+ }
+
+ /*
+ * We take a reference when we initialize udqp and gdqp,
+ * so it is important that we never blindly double trip on
+ * the same variable. See xfs_create() for an example.
+ */
+ ASSERT(udqp == NULL);
+ ASSERT(gdqp == NULL);
+ code = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
+ qflags, &udqp, &gdqp);
+ if (code)
+ return code;
+ }
+
+ /*
+ * For the other attributes, we acquire the inode lock and
+ * first do an error checking pass.
+ */
+ tp = NULL;
+ lock_flags = XFS_ILOCK_EXCL;
+ if (flags & XFS_ATTR_NOLOCK)
+ need_iolock = 0;
+ if (!(mask & ATTR_SIZE)) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+ commit_flags = 0;
+ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp),
+ 0, 0, 0);
+ if (code) {
+ lock_flags = 0;
+ goto error_return;
+ }
+ } else {
+ if (need_iolock)
+ lock_flags |= XFS_IOLOCK_EXCL;
+ }
+
+ xfs_ilock(ip, lock_flags);
+
+ /*
+ * Change file ownership. Must be the owner or privileged.
+ */
+ if (mask & (ATTR_UID|ATTR_GID)) {
+ /*
+ * These IDs could have changed since we last looked at them.
+ * But, we're assured that if the ownership did change
+ * while we didn't have the inode locked, inode's dquot(s)
+ * would have changed also.
+ */
+ iuid = ip->i_d.di_uid;
+ igid = ip->i_d.di_gid;
+ gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
+ uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
+
+ /*
+ * Do a quota reservation only if uid/gid is actually
+ * going to change.
+ */
+ if (XFS_IS_QUOTA_RUNNING(mp) &&
+ ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+ (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+ ASSERT(tp);
+ code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
+ capable(CAP_FOWNER) ?
+ XFS_QMOPT_FORCE_RES : 0);
+ if (code) /* out of quota */
+ goto error_return;
+ }
+ }
+
+ /*
+ * Truncate file. Must have write permission and not be a directory.
+ */
+ if (mask & ATTR_SIZE) {
+ /* Short circuit the truncate case for zero length files */
+ if (iattr->ia_size == 0 &&
+ ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ lock_flags &= ~XFS_ILOCK_EXCL;
+ if (mask & ATTR_CTIME) {
+ inode->i_mtime = inode->i_ctime =
+ current_fs_time(inode->i_sb);
+ xfs_mark_inode_dirty_sync(ip);
+ }
+ code = 0;
+ goto error_return;
+ }
+
+ if (S_ISDIR(ip->i_d.di_mode)) {
+ code = XFS_ERROR(EISDIR);
+ goto error_return;
+ } else if (!S_ISREG(ip->i_d.di_mode)) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+
+ /*
+ * Make sure that the dquots are attached to the inode.
+ */
+ code = xfs_qm_dqattach_locked(ip, 0);
+ if (code)
+ goto error_return;
+
+ /*
+ * Now we can make the changes. Before we join the inode
+ * to the transaction, if ATTR_SIZE is set then take care of
+ * the part of the truncation that must be done without the
+ * inode lock. This needs to be done before joining the inode
+ * to the transaction, because the inode cannot be unlocked
+ * once it is a part of the transaction.
+ */
+ if (iattr->ia_size > ip->i_size) {
+ /*
+ * Do the first part of growing a file: zero any data
+ * in the last block that is beyond the old EOF. We
+ * need to do this before the inode is joined to the
+ * transaction to modify the i_size.
+ */
+ code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+ if (code)
+ goto error_return;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ lock_flags &= ~XFS_ILOCK_EXCL;
+
+ /*
+ * We are going to log the inode size change in this
+ * transaction so any previous writes that are beyond the on
+ * disk EOF and the new EOF that have not been written out need
+ * to be written here. If we do not write the data out, we
+ * expose ourselves to the null files problem.
+ *
+ * Only flush from the on disk size to the smaller of the in
+ * memory file size or the new size as that's the range we
+ * really care about here and prevents waiting for other data
+ * not within the range we care about here.
+ */
+ if (ip->i_size != ip->i_d.di_size &&
+ iattr->ia_size > ip->i_d.di_size) {
+ code = xfs_flush_pages(ip,
+ ip->i_d.di_size, iattr->ia_size,
+ XBF_ASYNC, FI_NONE);
+ if (code)
+ goto error_return;
+ }
+
+ /* wait for all I/O to complete */
+ xfs_ioend_wait(ip);
+
+ code = -block_truncate_page(inode->i_mapping, iattr->ia_size,
+ xfs_get_blocks);
+ if (code)
+ goto error_return;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+ code = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
+ if (code)
+ goto error_return;
+
+ truncate_setsize(inode, iattr->ia_size);
+
+ commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+ lock_flags |= XFS_ILOCK_EXCL;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, ip);
+
+ /*
+ * Only change the c/mtime if we are changing the size
+ * or we are explicitly asked to change it. This handles
+ * the semantic difference between truncate() and ftruncate()
+ * as implemented in the VFS.
+ *
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME
+ * is a special case where we need to update the times despite
+ * not having these flags set. For all other operations the
+ * VFS set these flags explicitly if it wants a timestamp
+ * update.
+ */
+ if (iattr->ia_size != ip->i_size &&
+ (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ iattr->ia_ctime = iattr->ia_mtime =
+ current_fs_time(inode->i_sb);
+ mask |= ATTR_CTIME | ATTR_MTIME;
+ }
+
+ if (iattr->ia_size > ip->i_size) {
+ ip->i_d.di_size = iattr->ia_size;
+ ip->i_size = iattr->ia_size;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ } else if (iattr->ia_size <= ip->i_size ||
+ (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
+ /*
+ * signal a sync transaction unless
+ * we're truncating an already unlinked
+ * file on a wsync filesystem
+ */
+ code = xfs_itruncate_finish(&tp, ip, iattr->ia_size,
+ XFS_DATA_FORK,
+ ((ip->i_d.di_nlink != 0 ||
+ !(mp->m_flags & XFS_MOUNT_WSYNC))
+ ? 1 : 0));
+ if (code)
+ goto abort_return;
+ /*
+ * Truncated "down", so we're removing references
+ * to old data here - if we now delay flushing for
+ * a long time, we expose ourselves unduly to the
+ * notorious NULL files problem. So, we mark this
+ * vnode and flush it when the file is closed, and
+ * do not wait the usual (long) time for writeout.
+ */
+ xfs_iflags_set(ip, XFS_ITRUNCATED);
+ }
+ } else if (tp) {
+ xfs_trans_ijoin(tp, ip);
+ }
+
+ /*
+ * Change file ownership. Must be the owner or privileged.
+ */
+ if (mask & (ATTR_UID|ATTR_GID)) {
+ /*
+ * CAP_FSETID overrides the following restrictions:
+ *
+ * The set-user-ID and set-group-ID bits of a file will be
+ * cleared upon successful return from chown()
+ */
+ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+ !capable(CAP_FSETID)) {
+ ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+ }
+
+ /*
+ * Change the ownerships and register quota modifications
+ * in the transaction.
+ */
+ if (iuid != uid) {
+ if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
+ ASSERT(mask & ATTR_UID);
+ ASSERT(udqp);
+ olddquot1 = xfs_qm_vop_chown(tp, ip,
+ &ip->i_udquot, udqp);
+ }
+ ip->i_d.di_uid = uid;
+ inode->i_uid = uid;
+ }
+ if (igid != gid) {
+ if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
+ ASSERT(!XFS_IS_PQUOTA_ON(mp));
+ ASSERT(mask & ATTR_GID);
+ ASSERT(gdqp);
+ olddquot2 = xfs_qm_vop_chown(tp, ip,
+ &ip->i_gdquot, gdqp);
+ }
+ ip->i_d.di_gid = gid;
+ inode->i_gid = gid;
+ }
+ }
+
+ /*
+ * Change file access modes.
+ */
+ if (mask & ATTR_MODE) {
+ umode_t mode = iattr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+
+ ip->i_d.di_mode &= S_IFMT;
+ ip->i_d.di_mode |= mode & ~S_IFMT;
+
+ inode->i_mode &= S_IFMT;
+ inode->i_mode |= mode & ~S_IFMT;
+ }
+
+ /*
+ * Change file access or modified times.
+ */
+ if (mask & ATTR_ATIME) {
+ inode->i_atime = iattr->ia_atime;
+ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+ ip->i_update_core = 1;
+ }
+ if (mask & ATTR_CTIME) {
+ inode->i_ctime = iattr->ia_ctime;
+ ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+ ip->i_update_core = 1;
+ }
+ if (mask & ATTR_MTIME) {
+ inode->i_mtime = iattr->ia_mtime;
+ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+ ip->i_update_core = 1;
+ }
+
+ /*
+ * And finally, log the inode core if any attribute in it
+ * has been changed.
+ */
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE|
+ ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ XFS_STATS_INC(xs_ig_attrchg);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ * This is slightly sub-optimal in that truncates require
+ * two sync transactions instead of one for wsync filesystems.
+ * One for the truncate and one for the timestamps since we
+ * don't want to change the timestamps unless we're sure the
+ * truncate worked. Truncates are less than 1% of the laddis
+ * mix so this probably isn't worth the trouble to optimize.
+ */
+ code = 0;
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+
+ code = xfs_trans_commit(tp, commit_flags);
+
+ xfs_iunlock(ip, lock_flags);
+
+ /*
+ * Release any dquot(s) the inode had kept before chown.
+ */
+ xfs_qm_dqrele(olddquot1);
+ xfs_qm_dqrele(olddquot2);
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+
+ if (code)
+ return code;
+
+ /*
+ * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+ * update. We could avoid this with linked transactions
+ * and passing down the transaction pointer all the way
+ * to attr_set. No previous user of the generic
+ * Posix ACL code seems to care about this issue either.
+ */
+ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
+ code = -xfs_acl_chmod(inode);
+ if (code)
+ return XFS_ERROR(code);
+ }
+
+ return 0;
+
+ abort_return:
+ commit_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ if (tp) {
+ xfs_trans_cancel(tp, commit_flags);
+ }
+ if (lock_flags != 0) {
+ xfs_iunlock(ip, lock_flags);
+ }
+ return code;
+}
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 2 extents back from
+ * bmapi.
+ */
+#define SYMLINK_MAPS 2
+
+STATIC int
+xfs_readlink_bmap(
+ xfs_inode_t *ip,
+ char *link)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ int pathlen = ip->i_d.di_size;
+ int nmaps = SYMLINK_MAPS;
+ xfs_bmbt_irec_t mval[SYMLINK_MAPS];
+ xfs_daddr_t d;
+ int byte_cnt;
+ int n;
+ xfs_buf_t *bp;
+ int error = 0;
+
+ error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), 0, NULL, 0,
+ mval, &nmaps, NULL);
+ if (error)
+ goto out;
+
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+ bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+ XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK);
+ error = XFS_BUF_GETERROR(bp);
+ if (error) {
+ xfs_ioerror_alert("xfs_readlink",
+ ip->i_mount, bp, XFS_BUF_ADDR(bp));
+ xfs_buf_relse(bp);
+ goto out;
+ }
+ if (pathlen < byte_cnt)
+ byte_cnt = pathlen;
+ pathlen -= byte_cnt;
+
+ memcpy(link, XFS_BUF_PTR(bp), byte_cnt);
+ xfs_buf_relse(bp);
+ }
+
+ link[ip->i_d.di_size] = '\0';
+ error = 0;
+
+ out:
+ return error;
+}
+
+int
+xfs_readlink(
+ xfs_inode_t *ip,
+ char *link)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_fsize_t pathlen;
+ int error = 0;
+
+ trace_xfs_readlink(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ pathlen = ip->i_d.di_size;
+ if (!pathlen)
+ goto out;
+
+ if (pathlen < 0 || pathlen > MAXPATHLEN) {
+ xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+ __func__, (unsigned long long) ip->i_ino,
+ (long long) pathlen);
+ ASSERT(0);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out;
+ }
+
+
+ if (ip->i_df.if_flags & XFS_IFINLINE) {
+ memcpy(link, ip->i_df.if_u1.if_data, pathlen);
+ link[pathlen] = '\0';
+ } else {
+ error = xfs_readlink_bmap(ip, link);
+ }
+
+ out:
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/*
+ * Flags for xfs_free_eofblocks
+ */
+#define XFS_FREE_EOF_TRYLOCK (1<<0)
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+ */
+STATIC int
+xfs_free_eofblocks(
+ xfs_mount_t *mp,
+ xfs_inode_t *ip,
+ int flags)
+{
+ xfs_trans_t *tp;
+ int error;
+ xfs_fileoff_t end_fsb;
+ xfs_fileoff_t last_fsb;
+ xfs_filblks_t map_len;
+ int nimaps;
+ xfs_bmbt_irec_t imap;
+
+ /*
+ * Figure out if there are any blocks beyond the end
+ * of the file. If not, then there is nothing to do.
+ */
+ end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+ last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+ if (last_fsb <= end_fsb)
+ return 0;
+ map_len = last_fsb - end_fsb;
+
+ nimaps = 1;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
+ NULL, 0, &imap, &nimaps, NULL);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!error && (nimaps != 0) &&
+ (imap.br_startblock != HOLESTARTBLOCK ||
+ ip->i_delayed_blks)) {
+ /*
+ * Attach the dquots to the inode up front.
+ */
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
+ * There are blocks after the end of file.
+ * Free them up now by truncating the file to
+ * its current size.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+ /*
+ * Do the xfs_itruncate_start() call before
+ * reserving any log space because
+ * itruncate_start will call into the buffer
+ * cache and we can't
+ * do that within a transaction.
+ */
+ if (flags & XFS_FREE_EOF_TRYLOCK) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ xfs_trans_cancel(tp, 0);
+ return 0;
+ }
+ } else {
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ }
+ error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+ ip->i_size);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ error = xfs_trans_reserve(tp, 0,
+ XFS_ITRUNCATE_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+
+ error = xfs_itruncate_finish(&tp, ip,
+ ip->i_size,
+ XFS_DATA_FORK,
+ 0);
+ /*
+ * If we get an error at this point we
+ * simply don't bother truncating the file.
+ */
+ if (error) {
+ xfs_trans_cancel(tp,
+ (XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_ABORT));
+ } else {
+ error = xfs_trans_commit(tp,
+ XFS_TRANS_RELEASE_LOG_RES);
+ }
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+ }
+ return error;
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+ xfs_inode_t *ip,
+ xfs_trans_t **tpp)
+{
+ xfs_buf_t *bp;
+ int committed;
+ int done;
+ int error;
+ xfs_fsblock_t first_block;
+ xfs_bmap_free_t free_list;
+ int i;
+ xfs_mount_t *mp;
+ xfs_bmbt_irec_t mval[SYMLINK_MAPS];
+ int nmaps;
+ xfs_trans_t *ntp;
+ int size;
+ xfs_trans_t *tp;
+
+ tp = *tpp;
+ mp = ip->i_mount;
+ ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
+ /*
+ * We're freeing a symlink that has some
+ * blocks allocated to it. Free the
+ * blocks here. We know that we've got
+ * either 1 or 2 extents and that we can
+ * free them all in one bunmapi call.
+ */
+ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+ if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ *tpp = NULL;
+ return error;
+ }
+ /*
+ * Lock the inode, fix the size, and join it to the transaction.
+ * Hold it so in the normal path, we still have it locked for
+ * the second transaction. In the error paths we need it
+ * held so the cancel won't rele it, see below.
+ */
+ xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ size = (int)ip->i_d.di_size;
+ ip->i_d.di_size = 0;
+ xfs_trans_ijoin(tp, ip);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /*
+ * Find the block(s) so we can inval and unmap them.
+ */
+ done = 0;
+ xfs_bmap_init(&free_list, &first_block);
+ nmaps = ARRAY_SIZE(mval);
+ if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
+ XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
+ &free_list)))
+ goto error0;
+ /*
+ * Invalidate the block(s).
+ */
+ for (i = 0; i < nmaps; i++) {
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+ xfs_trans_binval(tp, bp);
+ }
+ /*
+ * Unmap the dead block(s) to the free_list.
+ */
+ if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+ &first_block, &free_list, &done)))
+ goto error1;
+ ASSERT(done);
+ /*
+ * Commit the first transaction. This logs the EFI and the inode.
+ */
+ if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
+ goto error1;
+ /*
+ * The transaction must have been committed, since there were
+ * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
+ * The new tp has the extent freeing and EFDs.
+ */
+ ASSERT(committed);
+ /*
+ * The first xact was committed, so add the inode to the new one.
+ * Mark it dirty so it will be logged and moved forward in the log as
+ * part of every commit.
+ */
+ xfs_trans_ijoin(tp, ip);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /*
+ * Get a new, empty transaction to return to our caller.
+ */
+ ntp = xfs_trans_dup(tp);
+ /*
+ * Commit the transaction containing extent freeing and EFDs.
+ * If we get an error on the commit here or on the reserve below,
+ * we need to unlock the inode since the new transaction doesn't
+ * have the inode attached.
+ */
+ error = xfs_trans_commit(tp, 0);
+ tp = ntp;
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ goto error0;
+ }
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(tp->t_ticket);
+
+ /*
+ * Remove the memory for extent descriptions (just bookkeeping).
+ */
+ if (ip->i_df.if_bytes)
+ xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+ ASSERT(ip->i_df.if_bytes == 0);
+ /*
+ * Put an itruncate log reservation in the new transaction
+ * for our caller.
+ */
+ if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ goto error0;
+ }
+ /*
+ * Return with the inode locked but not joined to the transaction.
+ */
+ *tpp = tp;
+ return 0;
+
+ error1:
+ xfs_bmap_cancel(&free_list);
+ error0:
+ /*
+ * Have to come here with the inode locked and either
+ * (held and in the transaction) or (not in the transaction).
+ * If the inode isn't held then cancel would iput it, but
+ * that's wrong since this is inactive and the vnode ref
+ * count is 0 already.
+ * Cancel won't do anything to the inode if held, but it still
+ * needs to be locked until the cancel is done, if it was
+ * joined to the transaction.
+ */
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ *tpp = NULL;
+ return error;
+
+}
+
+STATIC int
+xfs_inactive_symlink_local(
+ xfs_inode_t *ip,
+ xfs_trans_t **tpp)
+{
+ int error;
+
+ ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
+ /*
+ * We're freeing a symlink which fit into
+ * the inode. Just free the memory used
+ * to hold the old symlink.
+ */
+ error = xfs_trans_reserve(*tpp, 0,
+ XFS_ITRUNCATE_LOG_RES(ip->i_mount),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
+
+ if (error) {
+ xfs_trans_cancel(*tpp, 0);
+ *tpp = NULL;
+ return error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ /*
+ * Zero length symlinks _can_ exist.
+ */
+ if (ip->i_df.if_bytes > 0) {
+ xfs_idata_realloc(ip,
+ -(ip->i_df.if_bytes),
+ XFS_DATA_FORK);
+ ASSERT(ip->i_df.if_bytes == 0);
+ }
+ return 0;
+}
+
+STATIC int
+xfs_inactive_attrs(
+ xfs_inode_t *ip,
+ xfs_trans_t **tpp)
+{
+ xfs_trans_t *tp;
+ int error;
+ xfs_mount_t *mp;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ tp = *tpp;
+ mp = ip->i_mount;
+ ASSERT(ip->i_d.di_forkoff != 0);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ goto error_unlock;
+
+ error = xfs_attr_inactive(ip);
+ if (error)
+ goto error_unlock;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+ error = xfs_trans_reserve(tp, 0,
+ XFS_IFREE_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_INACTIVE_LOG_COUNT);
+ if (error)
+ goto error_cancel;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+ ASSERT(ip->i_d.di_anextents == 0);
+
+ *tpp = tp;
+ return 0;
+
+error_cancel:
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+error_unlock:
+ *tpp = NULL;
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+int
+xfs_release(
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ int error;
+
+ if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+ return 0;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return 0;
+
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ int truncated;
+
+ /*
+ * If we are using filestreams, and we have an unlinked
+ * file that we are processing the last close on, then nothing
+ * will be able to reopen and write to this file. Purge this
+ * inode from the filestreams cache so that it doesn't delay
+ * teardown of the inode.
+ */
+ if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
+ xfs_filestream_deassociate(ip);
+
+ /*
+ * If we previously truncated this file and removed old data
+ * in the process, we want to initiate "early" writeout on
+ * the last close. This is an attempt to combat the notorious
+ * NULL files problem which is particularly noticeable from a
+ * truncate down, buffered (re-)write (delalloc), followed by
+ * a crash. What we are effectively doing here is
+ * significantly reducing the time window where we'd otherwise
+ * be exposed to that problem.
+ */
+ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+ if (truncated) {
+ xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+ if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
+ xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
+ }
+ }
+
+ if (ip->i_d.di_nlink == 0)
+ return 0;
+
+ if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+ ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
+ ip->i_delayed_blks > 0)) &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
+ (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+
+ /*
+ * If we can't get the iolock just skip truncating the blocks
+ * past EOF because we could deadlock with the mmap_sem
+ * otherwise. We'll get another chance to drop them once the
+ * last reference to the inode is dropped, so we'll never leak
+ * blocks permanently.
+ *
+ * Further, check if the inode is being opened, written and
+ * closed frequently and we have delayed allocation blocks
+ * outstanding (e.g. streaming writes from the NFS server),
+ * truncating the blocks past EOF will cause fragmentation to
+ * occur.
+ *
+ * In this case don't do the truncation, either, but we have to
+ * be careful how we detect this case. Blocks beyond EOF show
+ * up as i_delayed_blks even when the inode is clean, so we
+ * need to truncate them away first before checking for a dirty
+ * release. Hence on the first dirty close we will still remove
+ * the speculative allocation, but after that we will leave it
+ * in place.
+ */
+ if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+ return 0;
+
+ error = xfs_free_eofblocks(mp, ip,
+ XFS_FREE_EOF_TRYLOCK);
+ if (error)
+ return error;
+
+ /* delalloc blocks after truncation means it really is dirty */
+ if (ip->i_delayed_blks)
+ xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+ }
+ return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode
+ * goes to zero. If the file has been unlinked, then it must
+ * now be truncated. Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+int
+xfs_inactive(
+ xfs_inode_t *ip)
+{
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int committed;
+ xfs_trans_t *tp;
+ xfs_mount_t *mp;
+ int error;
+ int truncate;
+
+ /*
+ * If the inode is already free, then there can be nothing
+ * to clean up here.
+ */
+ if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
+ ASSERT(ip->i_df.if_real_bytes == 0);
+ ASSERT(ip->i_df.if_broot_bytes == 0);
+ return VN_INACTIVE_CACHE;
+ }
+
+ /*
+ * Only do a truncate if it's a regular file with
+ * some actual space in it. It's OK to look at the
+ * inode's fields without the lock because we're the
+ * only one with a reference to the inode.
+ */
+ truncate = ((ip->i_d.di_nlink == 0) &&
+ ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+ (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
+ ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
+
+ mp = ip->i_mount;
+
+ error = 0;
+
+ /* If this is a read-only mount, don't do this (would generate I/O) */
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ goto out;
+
+ if (ip->i_d.di_nlink != 0) {
+ if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
+ ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
+ ip->i_delayed_blks > 0)) &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+ (!(ip->i_d.di_flags &
+ (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
+ (ip->i_delayed_blks != 0)))) {
+ error = xfs_free_eofblocks(mp, ip, 0);
+ if (error)
+ return VN_INACTIVE_CACHE;
+ }
+ goto out;
+ }
+
+ ASSERT(ip->i_d.di_nlink == 0);
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return VN_INACTIVE_CACHE;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+ if (truncate) {
+ /*
+ * Do the xfs_itruncate_start() call before
+ * reserving any log space because itruncate_start
+ * will call into the buffer cache and we can't
+ * do that within a transaction.
+ */
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return VN_INACTIVE_CACHE;
+ }
+
+ error = xfs_trans_reserve(tp, 0,
+ XFS_ITRUNCATE_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_ITRUNCATE_LOG_COUNT);
+ if (error) {
+ /* Don't call itruncate_cleanup */
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return VN_INACTIVE_CACHE;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+
+ /*
+ * normally, we have to run xfs_itruncate_finish sync.
+ * But if filesystem is wsync and we're in the inactive
+ * path, then we know that nlink == 0, and that the
+ * xaction that made nlink == 0 is permanently committed
+ * since xfs_remove runs as a synchronous transaction.
+ */
+ error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
+ (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
+
+ if (error) {
+ xfs_trans_cancel(tp,
+ XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ return VN_INACTIVE_CACHE;
+ }
+ } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
+
+ /*
+ * If we get an error while cleaning up a
+ * symlink we bail out.
+ */
+ error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
+ xfs_inactive_symlink_rmt(ip, &tp) :
+ xfs_inactive_symlink_local(ip, &tp);
+
+ if (error) {
+ ASSERT(tp == NULL);
+ return VN_INACTIVE_CACHE;
+ }
+
+ xfs_trans_ijoin(tp, ip);
+ } else {
+ error = xfs_trans_reserve(tp, 0,
+ XFS_IFREE_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES,
+ XFS_INACTIVE_LOG_COUNT);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ return VN_INACTIVE_CACHE;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+ xfs_trans_ijoin(tp, ip);
+ }
+
+ /*
+ * If there are attributes associated with the file
+ * then blow them away now. The code calls a routine
+ * that recursively deconstructs the attribute fork.
+ * We need to just commit the current transaction
+ * because we can't use it for xfs_attr_inactive().
+ */
+ if (ip->i_d.di_anextents > 0) {
+ error = xfs_inactive_attrs(ip, &tp);
+ /*
+ * If we got an error, the transaction is already
+ * cancelled, and the inode is unlocked. Just get out.
+ */
+ if (error)
+ return VN_INACTIVE_CACHE;
+ } else if (ip->i_afp) {
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ }
+
+ /*
+ * Free the inode.
+ */
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_ifree(tp, ip, &free_list);
+ if (error) {
+ /*
+ * If we fail to free the inode, shut down. The cancel
+ * might do that, we need to make sure. Otherwise the
+ * inode might be lost for a long time or forever.
+ */
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_notice(mp, "%s: xfs_ifree returned error %d",
+ __func__, error);
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ }
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ } else {
+ /*
+ * Credit the quota account(s). The inode is gone.
+ */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+ /*
+ * Just ignore errors at this point. There is nothing we can
+ * do except to try to keep going. Make sure it's not a silent
+ * error.
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+ __func__, error);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+ __func__, error);
+ }
+
+ /*
+ * Release the dquots held by inode, if any.
+ */
+ xfs_qm_dqdetach(ip);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+
+ out:
+ return VN_INACTIVE_CACHE;
+}
+
+/*
+ * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to a the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
+ */
+int
+xfs_lookup(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_inode_t **ipp,
+ struct xfs_name *ci_name)
+{
+ xfs_ino_t inum;
+ int error;
+ uint lock_mode;
+
+ trace_xfs_lookup(dp, name);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return XFS_ERROR(EIO);
+
+ lock_mode = xfs_ilock_map_shared(dp);
+ error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+ xfs_iunlock_map_shared(dp, lock_mode);
+
+ if (error)
+ goto out;
+
+ error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+ if (error)
+ goto out_free_name;
+
+ return 0;
+
+out_free_name:
+ if (ci_name)
+ kmem_free(ci_name->name);
+out:
+ *ipp = NULL;
+ return error;
+}
+
+int
+xfs_create(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ mode_t mode,
+ xfs_dev_t rdev,
+ xfs_inode_t **ipp)
+{
+ int is_dir = S_ISDIR(mode);
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ boolean_t unlock_dp_on_error = B_FALSE;
+ uint cancel_flags;
+ int committed;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ uint resblks;
+ uint log_res;
+ uint log_count;
+
+ trace_xfs_create(dp, name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ prid = xfs_get_projid(dp);
+ else
+ prid = XFS_PROJID_DEFAULT;
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+ if (error)
+ return error;
+
+ if (is_dir) {
+ rdev = 0;
+ resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+ log_res = XFS_MKDIR_LOG_RES(mp);
+ log_count = XFS_MKDIR_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+ } else {
+ resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+ log_res = XFS_CREATE_LOG_RES(mp);
+ log_count = XFS_CREATE_LOG_COUNT;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+ }
+
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+ /*
+ * Initially assume that the file does not exist and
+ * reserve the resources for that case. If that is not
+ * the case we'll drop the one we have and get a more
+ * appropriate transaction later.
+ */
+ error = xfs_trans_reserve(tp, resblks, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
+ if (error == ENOSPC) {
+ /* flush outstanding delalloc blocks and retry */
+ xfs_flush_inodes(dp);
+ error = xfs_trans_reserve(tp, resblks, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
+ }
+ if (error == ENOSPC) {
+ /* No space at all so try a "no-allocation" reservation */
+ resblks = 0;
+ error = xfs_trans_reserve(tp, 0, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto out_trans_cancel;
+ }
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ unlock_dp_on_error = B_TRUE;
+
+ /*
+ * Check for directory link count overflow.
+ */
+ if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
+ error = XFS_ERROR(EMLINK);
+ goto out_trans_cancel;
+ }
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * Reserve disk quota and the inode.
+ */
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_canenter(tp, dp, name, resblks);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * A newly created regular or special file just has one directory
+ * entry pointing to them, but a directory also the "." entry
+ * pointing to itself.
+ */
+ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+ prid, resblks > 0, &ip, &committed);
+ if (error) {
+ if (error == ENOSPC)
+ goto out_trans_cancel;
+ goto out_trans_abort;
+ }
+
+ /*
+ * Now we join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dir_ialloc might commit the previous transaction
+ * (and release all the locks). An error from here on will result in
+ * the transaction cancel unlocking dp so don't do it explicitly in the
+ * error path.
+ */
+ xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+ unlock_dp_on_error = B_FALSE;
+
+ error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks ?
+ resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ if (error) {
+ ASSERT(error != ENOSPC);
+ goto out_trans_abort;
+ }
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ if (is_dir) {
+ error = xfs_dir_init(tp, ip, dp);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_bumplink(tp, dp);
+ if (error)
+ goto out_bmap_cancel;
+ }
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * create transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ out_trans_abort:
+ cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to
+ * release the inode. This prevents recursive transactions
+ * and deadlocks from xfs_inactive.
+ */
+ if (ip)
+ IRELE(ip);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+
+ if (unlock_dp_on_error)
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
+}
+
+#ifdef DEBUG
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
+#endif
+
+/*
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with
+ * a different value
+ */
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
+{
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+ if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+ lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
+
+ return lock_mode;
+}
+
+/*
+ * The following routine will lock n inodes in exclusive mode.
+ * We assume the caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock
+ * is in the AIL and we start waiting for another inode that is locked
+ * by a thread in a long running transaction (such as truncate). This can
+ * result in deadlock since the long running trans might need to wait
+ * for the inode we just locked in order to push the tail and free space
+ * in the log.
+ */
+void
+xfs_lock_inodes(
+ xfs_inode_t **ips,
+ int inodes,
+ uint lock_mode)
+{
+ int attempts = 0, i, j, try_lock;
+ xfs_log_item_t *lp;
+
+ ASSERT(ips && (inodes >= 2)); /* we need at least two */
+
+ try_lock = 0;
+ i = 0;
+
+again:
+ for (; i < inodes; i++) {
+ ASSERT(ips[i]);
+
+ if (i && (ips[i] == ips[i-1])) /* Already locked */
+ continue;
+
+ /*
+ * If try_lock is not set yet, make sure all locked inodes
+ * are not in the AIL.
+ * If any are, set try_lock to be used later.
+ */
+
+ if (!try_lock) {
+ for (j = (i - 1); j >= 0 && !try_lock; j--) {
+ lp = (xfs_log_item_t *)ips[j]->i_itemp;
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ try_lock++;
+ }
+ }
+ }
+
+ /*
+ * If any of the previous locks we have locked is in the AIL,
+ * we must TRY to get the second and subsequent locks. If
+ * we can't get any, we must release all we have
+ * and try again.
+ */
+
+ if (try_lock) {
+ /* try_lock must be 0 if i is 0. */
+ /*
+ * try_lock means we have an inode locked
+ * that is in the AIL.
+ */
+ ASSERT(i != 0);
+ if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
+ attempts++;
+
+ /*
+ * Unlock all previous guys and try again.
+ * xfs_iunlock will try to push the tail
+ * if the inode is in the AIL.
+ */
+
+ for(j = i - 1; j >= 0; j--) {
+
+ /*
+ * Check to see if we've already
+ * unlocked this one.
+ * Not the first one going back,
+ * and the inode ptr is the same.
+ */
+ if ((j != (i - 1)) && ips[j] ==
+ ips[j+1])
+ continue;
+
+ xfs_iunlock(ips[j], lock_mode);
+ }
+
+ if ((attempts % 5) == 0) {
+ delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+ xfs_lock_delays++;
+#endif
+ }
+ i = 0;
+ try_lock = 0;
+ goto again;
+ }
+ } else {
+ xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
+ }
+ }
+
+#ifdef DEBUG
+ if (attempts) {
+ if (attempts < 5) xfs_small_retries++;
+ else if (attempts < 100) xfs_middle_retries++;
+ else xfs_lots_retries++;
+ } else {
+ xfs_locked_n++;
+ }
+#endif
+}
+
+/*
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
+ */
+void
+xfs_lock_two_inodes(
+ xfs_inode_t *ip0,
+ xfs_inode_t *ip1,
+ uint lock_mode)
+{
+ xfs_inode_t *temp;
+ int attempts = 0;
+ xfs_log_item_t *lp;
+
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+ ASSERT(ip0->i_ino != ip1->i_ino);
+
+ if (ip0->i_ino > ip1->i_ino) {
+ temp = ip0;
+ ip0 = ip1;
+ ip1 = temp;
+ }
+
+ again:
+ xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
+
+ /*
+ * If the first lock we have locked is in the AIL, we must TRY to get
+ * the second lock. If we can't get it, we must release the first one
+ * and try again.
+ */
+ lp = (xfs_log_item_t *)ip0->i_itemp;
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+ xfs_iunlock(ip0, lock_mode);
+ if ((++attempts % 5) == 0)
+ delay(1); /* Don't just spin the CPU */
+ goto again;
+ }
+ } else {
+ xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+ }
+}
+
+int
+xfs_remove(
+ xfs_inode_t *dp,
+ struct xfs_name *name,
+ xfs_inode_t *ip)
+{
+ xfs_mount_t *mp = dp->i_mount;
+ xfs_trans_t *tp = NULL;
+ int is_dir = S_ISDIR(ip->i_d.di_mode);
+ int error = 0;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ int link_zero;
+ uint resblks;
+ uint log_count;
+
+ trace_xfs_remove(dp, name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ error = xfs_qm_dqattach(dp, 0);
+ if (error)
+ goto std_return;
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ goto std_return;
+
+ if (is_dir) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+ log_count = XFS_DEFAULT_LOG_COUNT;
+ } else {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+ log_count = XFS_REMOVE_LOG_COUNT;
+ }
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+ /*
+ * We try to get the real space reservation first,
+ * allowing for directory btree deletion(s) implying
+ * possible bmap insert(s). If we can't get the space
+ * reservation then we use 0 instead, and avoid the bmap
+ * btree insert(s) in the directory code by, if the bmap
+ * insert tries to happen, instead trimming the LAST
+ * block from the directory.
+ */
+ resblks = XFS_REMOVE_SPACE_RES(mp);
+ error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
+ if (error == ENOSPC) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
+ }
+ if (error) {
+ ASSERT(error != ENOSPC);
+ cancel_flags = 0;
+ goto out_trans_cancel;
+ }
+
+ xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+
+ /*
+ * If we're removing a directory perform some additional validation.
+ */
+ if (is_dir) {
+ ASSERT(ip->i_d.di_nlink >= 2);
+ if (ip->i_d.di_nlink != 2) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
+ }
+ if (!xfs_dir_isempty(ip)) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
+ }
+ }
+
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error) {
+ ASSERT(error != ENOENT);
+ goto out_bmap_cancel;
+ }
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ if (is_dir) {
+ /*
+ * Drop the link from ip's "..".
+ */
+ error = xfs_droplink(tp, dp);
+ if (error)
+ goto out_bmap_cancel;
+
+ /*
+ * Drop the "." link from ip to self.
+ */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_bmap_cancel;
+ } else {
+ /*
+ * When removing a non-directory we need to log the parent
+ * inode here. For a directory this is done implicitly
+ * by the xfs_droplink call for the ".." entry.
+ */
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ }
+
+ /*
+ * Drop the link from dp to ip.
+ */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_bmap_cancel;
+
+ /*
+ * Determine if this is the last link while
+ * we are in the transaction.
+ */
+ link_zero = (ip->i_d.di_nlink == 0);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * remove transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto std_return;
+
+ /*
+ * If we are using filestreams, kill the stream association.
+ * If the file is still open it may get a new one but that
+ * will get killed on last close in xfs_close() so we don't
+ * have to worry about that.
+ */
+ if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
+ xfs_filestream_deassociate(ip);
+
+ return 0;
+
+ out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+ cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
+}
+
+int
+xfs_link(
+ xfs_inode_t *tdp,
+ xfs_inode_t *sip,
+ struct xfs_name *target_name)
+{
+ xfs_mount_t *mp = tdp->i_mount;
+ xfs_trans_t *tp;
+ int error;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int cancel_flags;
+ int committed;
+ int resblks;
+
+ trace_xfs_link(tdp, target_name);
+
+ ASSERT(!S_ISDIR(sip->i_d.di_mode));
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ error = xfs_qm_dqattach(sip, 0);
+ if (error)
+ goto std_return;
+
+ error = xfs_qm_dqattach(tdp, 0);
+ if (error)
+ goto std_return;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+ error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+ if (error == ENOSPC) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto error_return;
+ }
+
+ xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin_ref(tp, sip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, tdp, XFS_ILOCK_EXCL);
+
+ /*
+ * If the source has too many links, we can't make any more to it.
+ */
+ if (sip->i_d.di_nlink >= XFS_MAXLINK) {
+ error = XFS_ERROR(EMLINK);
+ goto error_return;
+ }
+
+ /*
+ * If we are using project inheritance, we only allow hard link
+ * creation in our tree when the project IDs are the same; else
+ * the tree quota mechanism could be circumvented.
+ */
+ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+ error = XFS_ERROR(EXDEV);
+ goto error_return;
+ }
+
+ error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+ if (error)
+ goto error_return;
+
+ xfs_bmap_init(&free_list, &first_block);
+
+ error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error)
+ goto abort_return;
+ xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+ error = xfs_bumplink(tp, sip);
+ if (error)
+ goto abort_return;
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * link transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish (&tp, &free_list, &committed);
+ if (error) {
+ xfs_bmap_cancel(&free_list);
+ goto abort_return;
+ }
+
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+ return error;
+}
+
+int
+xfs_symlink(
+ xfs_inode_t *dp,
+ struct xfs_name *link_name,
+ const char *target_path,
+ mode_t mode,
+ xfs_inode_t **ipp)
+{
+ xfs_mount_t *mp = dp->i_mount;
+ xfs_trans_t *tp;
+ xfs_inode_t *ip;
+ int error;
+ int pathlen;
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ boolean_t unlock_dp_on_error = B_FALSE;
+ uint cancel_flags;
+ int committed;
+ xfs_fileoff_t first_fsb;
+ xfs_filblks_t fs_blocks;
+ int nmaps;
+ xfs_bmbt_irec_t mval[SYMLINK_MAPS];
+ xfs_daddr_t d;
+ const char *cur_chunk;
+ int byte_cnt;
+ int n;
+ xfs_buf_t *bp;
+ prid_t prid;
+ struct xfs_dquot *udqp, *gdqp;
+ uint resblks;
+
+ *ipp = NULL;
+ error = 0;
+ ip = NULL;
+ tp = NULL;
+
+ trace_xfs_symlink(dp, link_name);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ /*
+ * Check component lengths of the target path name.
+ */
+ pathlen = strlen(target_path);
+ if (pathlen >= MAXPATHLEN) /* total string too long */
+ return XFS_ERROR(ENAMETOOLONG);
+
+ udqp = gdqp = NULL;
+ if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ prid = xfs_get_projid(dp);
+ else
+ prid = XFS_PROJID_DEFAULT;
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+ if (error)
+ goto std_return;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ /*
+ * The symlink will fit into the inode data fork?
+ * There can't be any attributes so we get the whole variable part.
+ */
+ if (pathlen <= XFS_LITINO(mp))
+ fs_blocks = 0;
+ else
+ fs_blocks = XFS_B_TO_FSB(mp, pathlen);
+ resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+ error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+ if (error == ENOSPC && fs_blocks == 0) {
+ resblks = 0;
+ error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto error_return;
+ }
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ unlock_dp_on_error = B_TRUE;
+
+ /*
+ * Check whether the directory allows new symlinks or not.
+ */
+ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
+ error = XFS_ERROR(EPERM);
+ goto error_return;
+ }
+
+ /*
+ * Reserve disk quota : blocks and inode.
+ */
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+ if (error)
+ goto error_return;
+
+ /*
+ * Check for ability to enter directory entry, if no space reserved.
+ */
+ error = xfs_dir_canenter(tp, dp, link_name, resblks);
+ if (error)
+ goto error_return;
+ /*
+ * Initialize the bmap freelist prior to calling either
+ * bmapi or the directory create code.
+ */
+ xfs_bmap_init(&free_list, &first_block);
+
+ /*
+ * Allocate an inode for the symlink.
+ */
+ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+ prid, resblks > 0, &ip, NULL);
+ if (error) {
+ if (error == ENOSPC)
+ goto error_return;
+ goto error1;
+ }
+
+ /*
+ * An error after we've joined dp to the transaction will result in the
+ * transaction cancel unlocking dp so don't do it explicitly in the
+ * error path.
+ */
+ xfs_trans_ijoin_ref(tp, dp, XFS_ILOCK_EXCL);
+ unlock_dp_on_error = B_FALSE;
+
+ /*
+ * Also attach the dquot(s) to it, if applicable.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+
+ if (resblks)
+ resblks -= XFS_IALLOC_SPACE_RES(mp);
+ /*
+ * If the symlink will fit into the inode, write it inline.
+ */
+ if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+ xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+ memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
+ ip->i_d.di_size = pathlen;
+
+ /*
+ * The inode was initially created in extent format.
+ */
+ ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+ ip->i_df.if_flags |= XFS_IFINLINE;
+
+ ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+
+ } else {
+ first_fsb = 0;
+ nmaps = SYMLINK_MAPS;
+
+ error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
+ XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
+ &first_block, resblks, mval, &nmaps,
+ &free_list);
+ if (error)
+ goto error2;
+
+ if (resblks)
+ resblks -= fs_blocks;
+ ip->i_d.di_size = pathlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ cur_chunk = target_path;
+ for (n = 0; n < nmaps; n++) {
+ d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+ byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0);
+ ASSERT(bp && !XFS_BUF_GETERROR(bp));
+ if (pathlen < byte_cnt) {
+ byte_cnt = pathlen;
+ }
+ pathlen -= byte_cnt;
+
+ memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
+ cur_chunk += byte_cnt;
+
+ xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
+ }
+ }
+
+ /*
+ * Create the directory entry for the symlink.
+ */
+ error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error)
+ goto error2;
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * symlink transaction goes to disk before returning to
+ * the user.
+ */
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ xfs_trans_set_sync(tp);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error2;
+ }
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+
+ *ipp = ip;
+ return 0;
+
+ error2:
+ IRELE(ip);
+ error1:
+ xfs_bmap_cancel(&free_list);
+ cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+ xfs_trans_cancel(tp, cancel_flags);
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+
+ if (unlock_dp_on_error)
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ std_return:
+ return error;
+}
+
+int
+xfs_set_dmattrs(
+ xfs_inode_t *ip,
+ u_int evmask,
+ u_int16_t state)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_trans_t *tp;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+ error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+
+ ip->i_d.di_dmevmask = evmask;
+ ip->i_d.di_dmstate = state;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_commit(tp, 0);
+
+ return error;
+}
+
+/*
+ * xfs_alloc_file_space()
+ * This routine allocates disk space for the given file.
+ *
+ * If alloc_type == 0, this request is for an ALLOCSP type
+ * request which will change the file size. In this case, no
+ * DMAPI event will be generated by the call. A TRUNCATE event
+ * will be generated later by xfs_setattr.
+ *
+ * If alloc_type != 0, this request is for a RESVSP type
+ * request, and a DMAPI DM_EVENT_WRITE will be generated if the
+ * lower block boundary byte address is less than the file's
+ * length.
+ *
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+STATIC int
+xfs_alloc_file_space(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ xfs_off_t len,
+ int alloc_type,
+ int attr_flags)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ xfs_off_t count;
+ xfs_filblks_t allocated_fsb;
+ xfs_filblks_t allocatesize_fsb;
+ xfs_extlen_t extsz, temp;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_fsblock_t firstfsb;
+ int nimaps;
+ int bmapi_flag;
+ int quota_flag;
+ int rt;
+ xfs_trans_t *tp;
+ xfs_bmbt_irec_t imaps[1], *imapp;
+ xfs_bmap_free_t free_list;
+ uint qblocks, resblks, resrtextents;
+ int committed;
+ int error;
+
+ trace_xfs_alloc_file_space(ip);
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ if (len <= 0)
+ return XFS_ERROR(EINVAL);
+
+ rt = XFS_IS_REALTIME_INODE(ip);
+ extsz = xfs_get_extsz_hint(ip);
+
+ count = len;
+ imapp = &imaps[0];
+ nimaps = 1;
+ bmapi_flag = XFS_BMAPI_WRITE | alloc_type;
+ startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+ allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+ /*
+ * Allocate file space until done or until there is an error
+ */
+ while (allocatesize_fsb && !error) {
+ xfs_fileoff_t s, e;
+
+ /*
+ * Determine space reservations for data/realtime.
+ */
+ if (unlikely(extsz)) {
+ s = startoffset_fsb;
+ do_div(s, extsz);
+ s *= extsz;
+ e = startoffset_fsb + allocatesize_fsb;
+ if ((temp = do_mod(startoffset_fsb, extsz)))
+ e += temp;
+ if ((temp = do_mod(e, extsz)))
+ e += extsz - temp;
+ } else {
+ s = 0;
+ e = allocatesize_fsb;
+ }
+
+ /*
+ * The transaction reservation is limited to a 32-bit block
+ * count, hence we need to limit the number of blocks we are
+ * trying to reserve to avoid an overflow. We can't allocate
+ * more than @nimaps extents, and an extent is limited on disk
+ * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ */
+ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ if (unlikely(rt)) {
+ resrtextents = qblocks = resblks;
+ resrtextents /= mp->m_sb.sb_rextsize;
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ quota_flag = XFS_QMOPT_RES_RTBLKS;
+ } else {
+ resrtextents = 0;
+ resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ quota_flag = XFS_QMOPT_RES_REGBLKS;
+ }
+
+ /*
+ * Allocate and setup the transaction.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, resblks,
+ XFS_WRITE_LOG_RES(mp), resrtextents,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+ /*
+ * Check for running out of space
+ */
+ if (error) {
+ /*
+ * Free the transaction structure.
+ */
+ ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+ 0, quota_flag);
+ if (error)
+ goto error1;
+
+ xfs_trans_ijoin(tp, ip);
+
+ /*
+ * Issue the xfs_bmapi() call to allocate the blocks
+ */
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bmapi(tp, ip, startoffset_fsb,
+ allocatesize_fsb, bmapi_flag,
+ &firstfsb, 0, imapp, &nimaps,
+ &free_list);
+ if (error) {
+ goto error0;
+ }
+
+ /*
+ * Complete the transaction
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error0;
+ }
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error) {
+ break;
+ }
+
+ allocated_fsb = imapp->br_blockcount;
+
+ if (nimaps == 0) {
+ error = XFS_ERROR(ENOSPC);
+ break;
+ }
+
+ startoffset_fsb += allocated_fsb;
+ allocatesize_fsb -= allocated_fsb;
+ }
+
+ return error;
+
+error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+ xfs_bmap_cancel(&free_list);
+ xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+
+error1: /* Just cancel transaction */
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
+ */
+STATIC int
+xfs_zero_remaining_bytes(
+ xfs_inode_t *ip,
+ xfs_off_t startoff,
+ xfs_off_t endoff)
+{
+ xfs_bmbt_irec_t imap;
+ xfs_fileoff_t offset_fsb;
+ xfs_off_t lastoffset;
+ xfs_off_t offset;
+ xfs_buf_t *bp;
+ xfs_mount_t *mp = ip->i_mount;
+ int nimap;
+ int error = 0;
+
+ /*
+ * Avoid doing I/O beyond eof - it's not necessary
+ * since nothing can read beyond eof. The space will
+ * be zeroed when the file is extended anyway.
+ */
+ if (startoff >= ip->i_size)
+ return 0;
+
+ if (endoff > ip->i_size)
+ endoff = ip->i_size;
+
+ bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+ mp->m_rtdev_targp : mp->m_ddev_targp,
+ mp->m_sb.sb_blocksize, XBF_DONT_BLOCK);
+ if (!bp)
+ return XFS_ERROR(ENOMEM);
+
+ for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ nimap = 1;
+ error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0,
+ NULL, 0, &imap, &nimap, NULL);
+ if (error || nimap < 1)
+ break;
+ ASSERT(imap.br_blockcount >= 1);
+ ASSERT(imap.br_startoff == offset_fsb);
+ lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+ if (lastoffset > endoff)
+ lastoffset = endoff;
+ if (imap.br_startblock == HOLESTARTBLOCK)
+ continue;
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ if (imap.br_state == XFS_EXT_UNWRITTEN)
+ continue;
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_UNWRITE(bp);
+ XFS_BUF_READ(bp);
+ XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
+ xfsbdstrat(mp, bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
+ mp, bp, XFS_BUF_ADDR(bp));
+ break;
+ }
+ memset(XFS_BUF_PTR(bp) +
+ (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+ 0, lastoffset - offset + 1);
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_WRITE(bp);
+ xfsbdstrat(mp, bp);
+ error = xfs_buf_iowait(bp);
+ if (error) {
+ xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
+ mp, bp, XFS_BUF_ADDR(bp));
+ break;
+ }
+ }
+ xfs_buf_free(bp);
+ return error;
+}
+
+/*
+ * xfs_free_file_space()
+ * This routine frees disk space for the given file.
+ *
+ * This routine is only called by xfs_change_file_space
+ * for an UNRESVSP type call.
+ *
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+STATIC int
+xfs_free_file_space(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ xfs_off_t len,
+ int attr_flags)
+{
+ int committed;
+ int done;
+ xfs_fileoff_t endoffset_fsb;
+ int error;
+ xfs_fsblock_t firstfsb;
+ xfs_bmap_free_t free_list;
+ xfs_bmbt_irec_t imap;
+ xfs_off_t ioffset;
+ xfs_extlen_t mod=0;
+ xfs_mount_t *mp;
+ int nimap;
+ uint resblks;
+ uint rounding;
+ int rt;
+ xfs_fileoff_t startoffset_fsb;
+ xfs_trans_t *tp;
+ int need_iolock = 1;
+
+ mp = ip->i_mount;
+
+ trace_xfs_free_file_space(ip);
+
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ error = 0;
+ if (len <= 0) /* if nothing being freed */
+ return error;
+ rt = XFS_IS_REALTIME_INODE(ip);
+ startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+ endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+
+ if (attr_flags & XFS_ATTR_NOLOCK)
+ need_iolock = 0;
+ if (need_iolock) {
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ /* wait for the completion of any pending DIOs */
+ xfs_ioend_wait(ip);
+ }
+
+ rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ ioffset = offset & ~(rounding - 1);
+
+ if (VN_CACHED(VFS_I(ip)) != 0) {
+ error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
+ if (error)
+ goto out_unlock_iolock;
+ }
+
+ /*
+ * Need to zero the stuff we're not freeing, on disk.
+ * If it's a realtime file & can't use unwritten extents then we
+ * actually need to zero the extent edges. Otherwise xfs_bunmapi
+ * will take care of it for us.
+ */
+ if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ nimap = 1;
+ error = xfs_bmapi(NULL, ip, startoffset_fsb,
+ 1, 0, NULL, 0, &imap, &nimap, NULL);
+ if (error)
+ goto out_unlock_iolock;
+ ASSERT(nimap == 0 || nimap == 1);
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ xfs_daddr_t block;
+
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ block = imap.br_startblock;
+ mod = do_div(block, mp->m_sb.sb_rextsize);
+ if (mod)
+ startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+ }
+ nimap = 1;
+ error = xfs_bmapi(NULL, ip, endoffset_fsb - 1,
+ 1, 0, NULL, 0, &imap, &nimap, NULL);
+ if (error)
+ goto out_unlock_iolock;
+ ASSERT(nimap == 0 || nimap == 1);
+ if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ mod++;
+ if (mod && (mod != mp->m_sb.sb_rextsize))
+ endoffset_fsb -= mod;
+ }
+ }
+ if ((done = (endoffset_fsb <= startoffset_fsb)))
+ /*
+ * One contiguous piece to clear
+ */
+ error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+ else {
+ /*
+ * Some full blocks, possibly two pieces to clear
+ */
+ if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+ error = xfs_zero_remaining_bytes(ip, offset,
+ XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+ if (!error &&
+ XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+ error = xfs_zero_remaining_bytes(ip,
+ XFS_FSB_TO_B(mp, endoffset_fsb),
+ offset + len - 1);
+ }
+
+ /*
+ * free file space until done or until there is an error
+ */
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ while (!error && !done) {
+
+ /*
+ * allocate and setup the transaction. Allow this
+ * transaction to dip into the reserve blocks to ensure
+ * the freeing of the space succeeds at ENOSPC.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp,
+ resblks,
+ XFS_WRITE_LOG_RES(mp),
+ 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_WRITE_LOG_COUNT);
+
+ /*
+ * check for running out of space
+ */
+ if (error) {
+ /*
+ * Free the transaction structure.
+ */
+ ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ break;
+ }
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp,
+ ip->i_udquot, ip->i_gdquot,
+ resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto error1;
+
+ xfs_trans_ijoin(tp, ip);
+
+ /*
+ * issue the bunmapi() call to free the blocks
+ */
+ xfs_bmap_init(&free_list, &firstfsb);
+ error = xfs_bunmapi(tp, ip, startoffset_fsb,
+ endoffset_fsb - startoffset_fsb,
+ 0, 2, &firstfsb, &free_list, &done);
+ if (error) {
+ goto error0;
+ }
+
+ /*
+ * complete the transaction
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error) {
+ goto error0;
+ }
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ }
+
+ out_unlock_iolock:
+ if (need_iolock)
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+
+ error0:
+ xfs_bmap_cancel(&free_list);
+ error1:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
+ XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_change_file_space()
+ * This routine allocates or frees disk space for the given file.
+ * The user specified parameters are checked for alignment and size
+ * limitations.
+ *
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+int
+xfs_change_file_space(
+ xfs_inode_t *ip,
+ int cmd,
+ xfs_flock64_t *bf,
+ xfs_off_t offset,
+ int attr_flags)
+{
+ xfs_mount_t *mp = ip->i_mount;
+ int clrprealloc;
+ int error;
+ xfs_fsize_t fsize;
+ int setprealloc;
+ xfs_off_t startoffset;
+ xfs_off_t llen;
+ xfs_trans_t *tp;
+ struct iattr iattr;
+ int prealloc_type;
+
+ if (!S_ISREG(ip->i_d.di_mode))
+ return XFS_ERROR(EINVAL);
+
+ switch (bf->l_whence) {
+ case 0: /*SEEK_SET*/
+ break;
+ case 1: /*SEEK_CUR*/
+ bf->l_start += offset;
+ break;
+ case 2: /*SEEK_END*/
+ bf->l_start += ip->i_size;
+ break;
+ default:
+ return XFS_ERROR(EINVAL);
+ }
+
+ llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
+
+ if ( (bf->l_start < 0)
+ || (bf->l_start > XFS_MAXIOFFSET(mp))
+ || (bf->l_start + llen < 0)
+ || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
+ return XFS_ERROR(EINVAL);
+
+ bf->l_whence = 0;
+
+ startoffset = bf->l_start;
+ fsize = ip->i_size;
+
+ /*
+ * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
+ * file space.
+ * These calls do NOT zero the data space allocated to the file,
+ * nor do they change the file size.
+ *
+ * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
+ * space.
+ * These calls cause the new file data to be zeroed and the file
+ * size to be changed.
+ */
+ setprealloc = clrprealloc = 0;
+ prealloc_type = XFS_BMAPI_PREALLOC;
+
+ switch (cmd) {
+ case XFS_IOC_ZERO_RANGE:
+ prealloc_type |= XFS_BMAPI_CONVERT;
+ xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+ /* FALLTHRU */
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_RESVSP64:
+ error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
+ prealloc_type, attr_flags);
+ if (error)
+ return error;
+ setprealloc = 1;
+ break;
+
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_UNRESVSP64:
+ if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
+ attr_flags)))
+ return error;
+ break;
+
+ case XFS_IOC_ALLOCSP:
+ case XFS_IOC_ALLOCSP64:
+ case XFS_IOC_FREESP:
+ case XFS_IOC_FREESP64:
+ if (startoffset > fsize) {
+ error = xfs_alloc_file_space(ip, fsize,
+ startoffset - fsize, 0, attr_flags);
+ if (error)
+ break;
+ }
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = startoffset;
+
+ error = xfs_setattr(ip, &iattr, attr_flags);
+
+ if (error)
+ return error;
+
+ clrprealloc = 1;
+ break;
+
+ default:
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
+ * update the inode timestamp, mode, and prealloc flag bits
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+
+ if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
+ 0, 0, 0))) {
+ /* ASSERT(0); */
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, ip);
+
+ if ((attr_flags & XFS_ATTR_DMI) == 0) {
+ ip->i_d.di_mode &= ~S_ISUID;
+
+ /*
+ * Note that we don't have to worry about mandatory
+ * file locking being disabled here because we only
+ * clear the S_ISGID bit if the Group execute bit is
+ * on, but if it was on then mandatory locking wouldn't
+ * have been enabled.
+ */
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ }
+ if (setprealloc)
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+ else if (clrprealloc)
+ ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ if (attr_flags & XFS_ATTR_SYNC)
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp, 0);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
new file mode 100644
index 00000000..3bcd2335
--- /dev/null
+++ b/fs/xfs/xfs_vnodeops.h
@@ -0,0 +1,63 @@
+#ifndef _XFS_VNODEOPS_H
+#define _XFS_VNODEOPS_H 1
+
+struct attrlist_cursor_kern;
+struct file;
+struct iattr;
+struct inode;
+struct iovec;
+struct kiocb;
+struct pipe_inode_info;
+struct uio;
+struct xfs_inode;
+struct xfs_iomap;
+
+
+int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
+#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
+#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
+#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
+#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
+#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
+
+int xfs_readlink(struct xfs_inode *ip, char *link);
+int xfs_release(struct xfs_inode *ip);
+int xfs_inactive(struct xfs_inode *ip);
+int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode **ipp, struct xfs_name *ci_name);
+int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
+ xfs_dev_t rdev, struct xfs_inode **ipp);
+int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+ struct xfs_inode *ip);
+int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
+ struct xfs_name *target_name);
+int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
+ xfs_off_t *offset, filldir_t filldir);
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+ const char *target_path, mode_t mode, struct xfs_inode **ipp);
+int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
+int xfs_change_file_space(struct xfs_inode *ip, int cmd,
+ xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
+int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+ struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+ struct xfs_name *target_name, struct xfs_inode *target_ip);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+ unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+ unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
+ int flags, struct attrlist_cursor_kern *cursor);
+int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+ int flags, struct xfs_iomap *iomapp, int *niomaps);
+void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
+ xfs_off_t last, int fiopt);
+int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
+ xfs_off_t last, int fiopt);
+int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
+ xfs_off_t last, uint64_t flags, int fiopt);
+int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
+
+int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+
+#endif /* _XFS_VNODEOPS_H */